diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index b1bc858..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore index 283eae9..b95d9dc 100644 --- a/.gitignore +++ b/.gitignore @@ -50,4 +50,5 @@ coverage.xml *.mo # Sphinx documentation -docs/_build +/docs/_build/ + diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index 4c59f79..0000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,16 +0,0 @@ -version: 2 - -build: - os: ubuntu-22.04 - tools: - python: "3.12" - -sphinx: - configuration: docs/source/conf.py - fail_on_warning: false - -formats: [] - -python: - install: - - requirements: docs/requirements-docs.txt \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 78979c5..7a31c8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,20 +1,20 @@ Changelog -===# Change Chinese to English comments -The following records all changes worth noting in the project, formatted based on [Keep a Changelog]. +=== +以下记录了项目中所有值得关注的变更内容,其格式基于[Keep a Changelog]。 -This project version follows [Semantic Versioning] and [PEP-440]. +本项目版本遵守[Semantic Versioning]和[PEP-440]。 [Unreleased] --- ### Added -- This records new content added +- 这里记录新添加的内容 ### Changed -- This records changed content +- 这里记录变更的内容 0.1.0 - 2025-08-12 --- ### Added -- Create project +- 创建项目 [Unreleased]: http://icode.baidu.com/repos/baidu/hac-aiacc/vllm-kunlun/merge/0.1.0...master diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE.txt +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/README.md b/README.md index 3396dd8..fc69f23 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,18 @@ ![vLLM Kunlun Logo](vllm_kunlun/patches/vLLM_Kunlun.jpg)

- Documentation | - slack | + Documentation | + Users Forum | + slack |

--- ## Latest News🔥 -- [2025/12] Initial release of vLLM Kunlun +- [2025/11] +- [2025/11] +- [2025/11] +- [2025/11] Initial release of vLLM Kunlun --- @@ -30,28 +34,107 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans --- ## Supported Models +

Generative Models

- + - + - - - - - - - - @@ -61,7 +144,7 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans - + @@ -69,53 +152,13 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - -
ModelModel Support Quantization LoRA Piecewise Kunlun GraphNoteNote
Qwen2/2.5
Qwen3
Qwen3-Moe/CoderQwen3-Moe
QwQ-32B
LLama2/3/3.1
GLM-4.5/AirQwen3-Next
Qwen3next⚠️coming soon
Gpt oss⚠️coming soon
Deepseek v3/3.2⚠️coming soon
@@ -133,61 +176,13 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans - Qianfan-VL + Qwen3-VL ✅ ✅ - - Qwen2.5VL - ✅ - - - ✅ - - - - InternVL2.5/3/3.5 - ✅ - - - ✅ - - - - InternVL3.5 - ✅ - - - ✅ - - - - InternS1 - ✅ - - - ✅ - - - - Qwen2.5 omini - ⚠️ - - - - comming soon - - - Qwen3vl - ⚠️ - - - - comming soon - @@ -207,17 +202,17 @@ Please use the following recommended versions to get started quickly: | Version | Release type | Doc | |----------|---------------|-----| -| v0.10.1.1 | Latest stable version | [QuickStart](https://vllm-kunlun.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-kunlun.readthedocs.io/en/latest/installation.html) for more details | +| v0.11.0 | Latest stable version | [QuickStart](./docs/_build/html/quick_start.html) and [Installation](./docs/_build/html/installation.html) for more details | --- ## Contributing -See [CONTRIBUTING](https://vllm-kunlun.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up the development environment, build, and test. +See [CONTRIBUTING]() for more details, which is a step-by-step guide to help you set up the development environment, build, and test. We welcome and value any contributions and collaborations: -- Open an [Issue](https://github.com/baidu/vLLM-Kunlun/issues) if you find a bug or have a feature request +- Open an [Issue]() if you find a bug or have a feature request ## License -Apache License 2.0, as found in the [LICENSE](https://github.com/baidu/vLLM-Kunlun/blob/main/LICENSE.txt) file. +Apache License 2.0, as found in the [LICENSE](./LICENSE) file. 
\ No newline at end of file diff --git a/ci.yml b/ci.yml new file mode 100644 index 0000000..abda4ed --- /dev/null +++ b/ci.yml @@ -0,0 +1,19 @@ +Global: + version: "2.0" + group_email: hac@baidu.com +Default: + profile: + - build +Profiles: + - profile: + name: build + mode: AGENT + environment: + image: DECK_STD_CENTOS7 + tools: + - python: 3.10.10 + build: + command: sh build.sh + excludeTools: [] + artifacts: + release: true \ No newline at end of file diff --git a/dockerfile/Dockerfile_vision b/dockerfile/Dockerfile_vision new file mode 100644 index 0000000..8e3669d --- /dev/null +++ b/dockerfile/Dockerfile_vision @@ -0,0 +1,8 @@ +ARG BASE_IMAGE=iregistry.baidu-int.com/hac_test/aiak-inference-llm:xpu_dev_202508030_v1 +FROM ${BASE_IMAGE} + +COPY vllm-kunlun /workspace/vllm-kunlun + +RUN bash /workspace/vllm-kunlun/dockerfile/install.sh + +WORKDIR /workspace \ No newline at end of file diff --git a/dockerfile/install.sh b/dockerfile/install.sh new file mode 100644 index 0000000..8fd7e8c --- /dev/null +++ b/dockerfile/install.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -exuo pipefail + +source /root/miniconda/etc/profile.d/conda.sh +conda activate python310_torch25_cuda +echo 'conda activate python310_torch25_cuda' >> ~/.bashrc +echo 'source /workspace/vllm-kunlun/setup_env.sh' >> ~/.bashrc + +#安装社区vllm +cd /workspace/vllm-kunlun +pip uninstall vllm -y +pip uninstall vllm-kunlun -y +pip install vllm==0.11.0 --no-build-isolation --no-deps --index-url https://pip.baidu-int.com/simple/ + +# +pip install -r /workspace/vllm-kunlun/requirements.txt + +#安装vllm-kunlun +python setup.py build +python setup.py install +cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/python310_torch25_cuda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py + +#安装Kl3自定义torch 01130 +wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run 
https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-02T05%3A01%3A27Z%2F-1%2Fhost%2Ff3cf499234f82303891aed2bcb0628918e379a21e841a3fac6bd94afef491ff7 +bash xpytorch-cp310-torch251-ubuntu2004-x64.run +rm xpytorch-cp310-torch251-ubuntu2004-x64.run +#安装Klx3自定义算子库 01130 +pip uninstall xtorch_ops -y +pip install "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xtorch_ops-0.1.2209%2B6752ad20-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-05T06%3A18%3A00Z%2F-1%2Fhost%2F14936c2b7e7c557c1400e4c467c79f7a9217374a7aa4a046711ac4d948f460cd" +#安装klx3自定义triton +pip install "https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKxPW2jzoJUuFZmI19s3yry%2F2025-11-05T02%3A47%3A29Z%2F-1%2Fhost%2Fd8c95dbd06187a3140ca3e681e00c6941c30e14bb1d4112a0c8bc3c93e5c9c3f" +#安装AIAK自定义算子库 +pip install "https://cce-ai-models.bj.bcebos.com/v1/chenyili/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKxPW2jzoJUuFZmI19s3yry%2F2025-12-05T06%3A37%3A39Z%2F-1%2Fhost%2F1002777dadd2afe4c1f047cbf0d94244d5b1f03295cd8f7a2802b92a13cd5035" \ No newline at end of file diff --git a/docs/.DS_Store b/docs/.DS_Store index 125470d..67c4bad 100644 Binary files a/docs/.DS_Store and b/docs/.DS_Store differ diff --git a/docs/README.md b/docs/README.md index 3496a6d..78050a5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,53 +5,52 @@ uv venv myenv --python 3.12 --seed source myenv/bin/activate - - # Step 1: Enter the docs directory +# 步骤1:进入docs目录 cd docs -# Step 2: Install dependencies (using uv) +# 步骤2:安装依赖(使用uv) uv pip install -r requirements-docs.txt -# Install sphinx-autobuild (if not in requirements file) +# 安装 sphinx-autobuild(如果没在 requirements 文件里) uv pip install sphinx-autobuild -# Run from the docs 
directory: +# 从 docs 目录运行: sphinx-autobuild ./source ./_build/html --port 8000 -# Step 1: Clean up old files +# 步骤1:清理旧文件 make clean -# Step 2: Build HTML +# 步骤2:构建HTML make html -# Step 3: Local preview +# 步骤3:本地预览 python -m http.server -d _build/html/ -Browser access: http://localhost:8000 +浏览器访问:http://localhost:8000 🌍 Internationalization -Internationalization translation process (taking Chinese as an example) +国际化翻译流程(以中文为例) -# Step 1: Extract translatable text (generate .pot) +# 步骤1:提取可翻译文本(生成 .pot) sphinx-build -b gettext source _build/gettext -# Step 2: Generate/update Chinese .po file +# 步骤2:生成/更新中文 .po 文件 sphinx-intl update -p _build/gettext -l zh_CN -# Step 3: Manually translate .po file -# Use a text editor to open source/locale/zh_CN/LC_MESSAGES/*.po -# Fill in the Chinese translation in msgstr "" +# 步骤3:人工翻译 .po 文件 +# 用文本编辑器打开 source/locale/zh_CN/LC_MESSAGES/*.po +# 在 msgstr "" 里填入中文翻译 -# Step 4: Compile and build Chinese documentation +# 步骤4:编译并构建中文文档 make intl -# Step 5: View the effect +# 步骤5:查看效果 python -m http.server -d _build/html -Browser access: +浏览器访问: -English version: http://localhost:8000 -Chinese version: http://localhost:8000/zh-cn +英文版: http://localhost:8000 +中文版: http://localhost:8000/zh-cn ``` diff --git a/docs/envs.py b/docs/envs.py index dc01993..0525e95 100644 --- a/docs/envs.py +++ b/docs/envs.py @@ -47,15 +47,18 @@ env_variables: Dict[str, Callable[[], Any]] = { # The C compiler used for compiling the package. If not set, the default # value is None, which means the system default C compiler will be used. "C_COMPILER": lambda: os.getenv("C_COMPILER", None), - - "SOC_VERSION": lambda: os.getenv("SOC_VERSION", "KUNLUNP800"), + # The version of the Kunlun chip. If not set, the default value is + # KUNLUN910B1(Available for A2 and A3 series). It's used for package building. + # Please make sure that the version is correct. 
+ "SOC_VERSION": lambda: os.getenv("SOC_VERSION", "KUNLUN910B1"), # If set, vllm-kunlun will print verbose logs during compilation "VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))), + # The home path for CANN toolkit. If not set, the default value is # /usr/local/Kunlun/kunlun-toolkit/latest "KUNLUN_HOME_PATH": lambda: os.getenv("KUNLUN_HOME_PATH", None), - # The path for XCCL library, it's used by pyxccl communicator backend. If - # not set, the default value is libxccl.so。 - "XCCL_SO_PATH": lambda: os.environ.get("XCCL_SO_PATH", None), + # The path for HCCL library, it's used by pyhccl communicator backend. If + # not set, the default value is libhccl.so。 + "HCCL_SO_PATH": lambda: os.environ.get("HCCL_SO_PATH", None), # The version of vllm is installed. This value is used for developers who # installed vllm from source locally. In this case, the version of vllm is # usually changed. For example, if the version of vllm is "0.9.0", but when @@ -116,6 +119,7 @@ env_variables: Dict[str, Callable[[], Any]] = { # and the mla_pa will be the default path of deepseek decode path. "VLLM_KUNLUN_MLA_PA": lambda: int(os.getenv("VLLM_KUNLUN_MLA_PA", 0)), # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled. + # this feature is supported in A2, and eager mode will get better performance. 
"VLLM_KUNLUN_ENABLE_MATMUL_ALLREDUCE": lambda: bool( int(os.getenv("VLLM_KUNLUN_ENABLE_MATMUL_ALLREDUCE", "0")) ), diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index 437daa0..5f44922 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -35,5 +35,4 @@ | Yijin Qiao | | Chenchao Hu | | Weijie Hong | -| Song Jiang | -| Hongwei Ma | \ No newline at end of file +| Song Jiang | \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index cd2333e..dcdfb30 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -65,17 +65,19 @@ myst_substitutions = { # the branch of vllm, used in vllm clone # - main branch: 'main' # - vX.Y.Z branch: 'vX.Y.Z' - "vllm_version": "0.10.1.1", + "vllm_version": "v0.11.0rc3", # the branch of vllm-kunlun, used in vllm-kunlun clone and image tag # - main branch: 'main' # - vX.Y.Z branch: latest vllm-kunlun release tag - "vllm_kunlun_version": "0.10.1.1", + "vllm_kunlun_version": "v0.11.0rc0", # the newest release version of vllm-kunlun and matched vLLM, used in pip install. # This value should be updated when cut down release. 
- "pip_vllm_kunlun_version": "0.10.1.1", - "pip_vllm_version": "0.10.1.1", + "pip_vllm_kunlun_version": "0.11.0rc0", + "pip_vllm_version": "0.11.0", + # CANN image tag + "cann_image_tag": "8.3.rc1-910b-ubuntu22.04-py3.11", # vllm version in ci - "ci_vllm_version": "0.10.1.1", + "ci_vllm_version": "v0.11.0", } # For cross-file header anchors @@ -102,6 +104,7 @@ exclude_patterns = [ ".venv", "README.md", "user_guide/release.template.md", + # TODO(yikun): Remove this after zh supported "**/*.zh.md", ] @@ -115,7 +118,7 @@ html_theme = "sphinx_book_theme" html_logo = "logos/vllm-kunlun-logo-text-light.png" html_theme_options = { "path_to_docs": "docs/source", - "repository_url": "https://github.com/baidu/vLLM-Kunlun", + "repository_url": "https://github.com/xxxxx/vllm-kunlun", "use_repository_button": True, "use_edit_page_button": True, } diff --git a/docs/source/developer_guide/contribution/contributing.md b/docs/source/developer_guide/contribution/contributing.md new file mode 100644 index 0000000..dd8254a --- /dev/null +++ b/docs/source/developer_guide/contribution/contributing.md @@ -0,0 +1,83 @@ +# Contributing + +## Building and Testing +It's recommended to set up a local development environment to build vllm-kunlun and run tests +before you submit a PR. 
+ +#### Run models locally + +After completing the lint setup shown in the quickstart, you can run your changes locally: + +```{code-block} bash + :substitutions: + +python -m vllm.entrypoints.openai.api_server \ + --host 0.0.0.0 \ + --port 8356 \ + --model your_modified_models \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --max-model-len 32768 \ + --tensor-parallel-size 1 \ + --dtype float16 \ + --max_num_seqs 128 \ + --max_num_batched_tokens 32768 \ + --block-size 128 \ + --no-enable-prefix-caching \ + --no-enable-chunked-prefill \ + --distributed-executor-backend mp \ + --served-model-name your_modified_models \ + --compilation-config '{"splitting_ops": ["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer"]}' \ +``` +Please save a screenshot of your service running successfully, and attach an accuracy report. + +#### Submit the commit + +```bash +# Commit changed files using `-s` +git commit -sm "your commit info" +``` + +🎉 Congratulations! You have completed the development environment setup. + + +## PR Title and Classification + +Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following: + +- `[Attention]` for new features or optimization in attention. +- `[Communicator]` for new features or optimization in communicators. +- `[ModelRunner]` for new features or optimization in model runner. +- `[Platform]` for new features or optimization in platform. +- `[Worker]` for new features or optimization in worker. +- `[Core]` for new features or optimization in the core vllm-kunlun logic (such as platform, attention, communicators, model runner) +- `[Kernel]` for changes affecting compute kernels and ops.
+- `[Bugfix]` for bug fixes. +- `[Doc]` for documentation fixes and improvements. +- `[Test]` for tests (such as unit tests). +- `[CI]` for build or continuous integration improvements. +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. + +:::{note} +If the PR spans more than one category, please include all relevant prefixes. +::: + +## Others + +If you find any problem when contributing, you can join our slack group to talk with us and then feel free to submit a PR to improve the doc to help other developers. + +:::{toctree} +:caption: Index +:maxdepth: 1 +testing +multi_node_test +::: \ No newline at end of file diff --git a/docs/source/developer_guide/contribution/index.md b/docs/source/developer_guide/contribution/index.md index e5b0a5e..8780ed7 100644 --- a/docs/source/developer_guide/contribution/index.md +++ b/docs/source/developer_guide/contribution/index.md @@ -1,70 +1,5 @@ # Contributing ## Building and Testing -It's recommended to set up a local development environment to build vllm-kunlun and run tests -before you submit a PR. -#### Run models locally - -After completing Run lint setup which is shown in quicksatrt, you can run your changed locally: - -```{code-block} bash - :substitutions: - -python -m vllm.entrypoints.openai.api_server \ - --host 0.0.0.0 \ - --port 8356 \ - --model /your_modified_models\ - --trust-remote-code \ - --tensor-parallel-size 1 \ - --no-enable-prefix-caching \ - --no-enable-chunked-prefill \ - --distributed-executor-backend mp \ - --served-model-name your_modified_models \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' \ -``` -Please save a screenshot of your service running successfully, and attach an accuracy report. - -#### Submit the commit - -```bash -# Commit changed files using `-s` -git commit -sm "your commit info" -``` - -🎉 Congratulations! 
You have completed the development environment setup. - - -## PR Title and Classification - -Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following: - -- `[Attention]` for new features or optimization in attention. -- `[Communicator]` for new features or optimization in communicators. -- `[ModelRunner]` for new features or optimization in model runner. -- `[Platform]` for new features or optimization in platform. -- `[Worker]` for new features or optimization in worker. -- `[Core]` for new features or optimization in the core vllm-kunlun logic (such as platform, attention, communicators, model runner) -- `[Kernel]` for changes affecting compute kernels and ops. -- `[Bugfix]` for bug fixes. -- `[Doc]` for documentation fixes and improvements. -- `[Test]` for tests (such as unit tests). -- `[CI]` for build or continuous integration improvements. -- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. - -:::{note} -If the PR spans more than one category, please include all relevant prefixes. -::: - -## Others - -If you find any problem when contributing, you can join our slack group to talk with us and then feel free to submit a PR to improve the doc to help other developers. - -:::{toctree} -:caption: Index -:maxdepth: 1 -testing -multi_node_test -::: \ No newline at end of file +Coming soon...
\ No newline at end of file diff --git a/docs/source/developer_guide/evaluation/accuracy/accuracy_server.md b/docs/source/developer_guide/evaluation/accuracy/accuracy_server.md index b0d0d1f..57a1965 100644 --- a/docs/source/developer_guide/evaluation/accuracy/accuracy_server.md +++ b/docs/source/developer_guide/evaluation/accuracy/accuracy_server.md @@ -88,24 +88,20 @@ if not os.path.exists(output_dir): # Step 4: Check if the directory exists # dump the mixed data to a jsonl file dump_jsonl_data(mixed_data, output_path) # Step 6: Securely write to the file ``` - Dataset composition visualization: - ``` ┌───────────────────────────────────────┐ │ VL-Test (1000 samples) │ ├─────────────────┬─────────────────────┤ │ PureText │ Vision │ -│ (333 samples) │ (667 samples) │ +│ (333 样本) │ (667 样本) │ ├─────────────────┼─────────────────────┤ │ • mmlu_pro │ • math_vista │ │ • ifeval │ • mmmu_pro │ │ • gsm8k │ │ └─────────────────┴─────────────────────┘ ``` - #### 3.Test - ```python from dotenv import dotenv_values @@ -138,14 +134,13 @@ task_cfg = TaskConfig( run_task(task_cfg=task_cfg) ``` - Parameter Tuning Guide: -| Parameter | Current value | Effect | Adjustment suggestions | -| ----------------- | ------------- | ---------------------------------------- | -------------------------------------------------------- | -| `temperature` | 0.6 | Control output diversity | Math problems ↓ 0.3 / Creative writing ↑ 0.9 | -| `top_p` | 0.95 | Filtering low-probability tokens | Reduce "nonsense" | -| `eval_batch_size` | 5 | Number of requests processed in parallel | With sufficient video memory, it can be increased to 10. 
| +| Parameter | Current value | Effect | Adjustment suggestions | +| ----------------- | ------ | --------------- | ----------------------- | +| `temperature` | 0.6 | Control output diversity | Math problems ↓ 0.3 / Creative writing ↑ 0.9 | +| `top_p` | 0.95 | Filtering low-probability tokens | Reduce "nonsense" | +| `eval_batch_size` | 5 | Number of requests processed in parallel | With sufficient video memory, it can be increased to 10. | Run the test: @@ -172,12 +167,11 @@ python accuracy.py 2>&1 | tee "$LOG_FILE" # ======================================== EXIT_CODE=${PIPESTATUS[0]} if [ $EXIT_CODE -eq 0 ]; then - echo "✅ Evaluation completed! Log saved to: $LOG_FILE" + echo "✅ 评测完成! 日志已保存到: $LOG_FILE" else - echo "❌ Evaluation failed! Exit code: $EXIT_CODE Please check the log: $LOG_FILE" + echo "❌ 评测失败! 退出码: $EXIT_CODE 请查看日志: $LOG_FILE" fi ``` - #### 4.Common problem fixes ##### 4.1 NLTK resource missing fix @@ -187,7 +181,6 @@ Resource punkt_tab not found. ``` Solution: - ```python import nltk import os @@ -200,13 +193,13 @@ os.makedirs(download_dir, exist_ok=True) nltk.data.path.append(download_dir) # Step 3: Download necessary resources -print("🔽 Start downloading punkt_tab resource...") +print("🔽 开始下载punkt_tab资源...") try: nltk.download("punkt_tab", download_dir=download_dir) - print("✅ Download successful!") + print("✅ 下载成功!") except Exception as e: - print(f"❌ Download failed: {e}") - print("💡 Alternative: Download manually from GitHub") + print(f"❌ 下载失败: {e}") + print("💡 备选方案:手动从GitHub下载") print( " URL: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip" ) diff --git a/docs/source/developer_guide/performance/performance_benchmark/benchmark_kernel.md b/docs/source/developer_guide/performance/performance_benchmark/benchmark_kernel.md index 0330a02..2a32c05 100644 --- a/docs/source/developer_guide/performance/performance_benchmark/benchmark_kernel.md +++ 
b/docs/source/developer_guide/performance/performance_benchmark/benchmark_kernel.md @@ -34,9 +34,9 @@ The fork pattern is used to track the entire time period from the start to the e /xxxx/xxxx/xprofiler -r500 --xpu=0 python test.py ``` -- --r: Sets the trace time resolution in nanoseconds (ns). The default is 100. If an "out of space error" occurs, try increasing the -r value to 500. +* --r: Sets the trace time resolution in nanoseconds (ns). The default is 100. If an "out of space error" occurs, try increasing the -r value to 500. -- --xpu: Specifies the acquisition device ID, supporting multi-card configuration. --xpu=all enables all cards; the default is card 0. +* --xpu: Specifies the acquisition device ID, supporting multi-card configuration. --xpu=all enables all cards; the default is card 0. More parameters can be found in the command-line parameters section later. @@ -58,7 +58,7 @@ A temporary .sock file will be generated in the execution directory. The path ne ```bash export XPU_ENABLE_PROFILER_TRACING=1 -export XPU_TRACING_OUTPUT_NAME=/xprofiler.sock +export XPU_TRACING_OUTPUT_NAME=/xprofiler.sock # Start your own program python xxx.py ``` @@ -99,7 +99,7 @@ xprofiler.sock ```python export XPU_ENABLE_PROFILER_TRACING=1 # Here, the path to the .sock file from step 2 is used for assignment. 
-export XPU_TRACING_OUTPUT_NAME=/xprofiler.sock +export XPU_TRACING_OUTPUT_NAME=/xprofiler.sock # Start your own program python xxx.py ``` @@ -108,21 +108,21 @@ Note: If you want to specify a particular card to run on, you must import the XP ##### More parameters -| parameters | Example | default value | describe | -| -------------------------- | --------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| -b or --buffer-size | -b=512 | 256 | Specifies the size of the trace buffer in MB. This is generally not required. However, if there are many trace signals, the buffer size can be increased appropriately to avoid OOS (Out of Size). | -| -x or --xpu | -x=0--xpu=0 | 0 | Set the card number to be tracked; multiple cards or all cards can be set. | -| -t or --time | -t=10 | off | Enable time mode, in seconds, to capture information over a specified period. | -| -d or --deamonize | -r500 | 0 | Enable daemon mode to retrieve events in the background. | -| -r or --export-profile | -e ./trace_output-e ./output/trace.json | ./ | Record the trace results to a document or folder. If this parameter is not specified, a default xprofiler.trace.json file will be generated in the execution directory. | -| -S or --settings | -S xprofiler.trace.json | off | xprofiler reads a JSON file containing the events that need to be traced. If this parameter is not configured, xprofiler enables `--profile-api-trace` and `--sse-trace` by default. | -| -A or --profiler-api-trace | -A | on | Get driver events. | -| -s or --sse-trace | -s | on | Get all SSE events. | -| -C or --cluster-trace | -C | off | Retrieve all cluster events. | -| -n or --sdnn-trace | -n | off | Get all SDNN events. | -| -c or --sdnn-cluster-trace | -c | off | Retrieve all SDNN cluster events. 
| -| -E or --cache-trace | -E | off | Get bandwidth statistics events. | -| -u or --debug | -u44:open log,debug level-u0:close log | 33 | Debug the interface and enable driver event/device event logging.。 | +| parameters | Example | default value | describe | +| -------------------------- | --------------------------------------- | ------ | ------------------------------------------------------------ | +| -b or --buffer-size | -b=512 | 256 | Specifies the size of the trace buffer in MB. This is generally not required. However, if there are many trace signals, the buffer size can be increased appropriately to avoid OOS (Out of Size). | +| -x or --xpu | -x=0--xpu=0 | 0 | Set the card number to be tracked; multiple cards or all cards can be set. | +| -t or --time | -t=10 | off | Enable time mode, in seconds, to capture information over a specified period. | +| -d or --deamonize | -r500 | 0 | Enable daemon mode to retrieve events in the background. | +| -r or --export-profile | -e ./trace_output-e ./output/trace.json | ./ | Record the trace results to a document or folder. If this parameter is not specified, a default xprofiler.trace.json file will be generated in the execution directory. | +| -S or --settings | -S xprofiler.trace.json | off | xprofiler reads a JSON file containing the events that need to be traced. If this parameter is not configured, xprofiler enables `--profile-api-trace` and `--sse-trace` by default. | +| -A or --profiler-api-trace | -A | on | Get driver events. | +| -s or --sse-trace | -s | on | Get all SSE events. | +| -C or --cluster-trace | -C | off | Retrieve all cluster events. | +| -n or --sdnn-trace | -n | off | Get all SDNN events. | +| -c or --sdnn-cluster-trace | -c | off | Retrieve all SDNN cluster events. | +| -E or --cache-trace | -E | off | Get bandwidth statistics events. 
| +| -u or --debug | -u44:open log,debug level-u0:close log | 33 | Debug the interface and enable driver event/device event logging. | #### 3.View Results @@ -144,4 +144,4 @@ Search directly, or visit[Perfetto UI](https://ui.perfetto.dev/#!/viewer?local_c With various performance data available, analysis and optimization can then be performed based on the results. -(Further details to be added later) +(Further details to be added later) \ No newline at end of file diff --git a/docs/source/developer_guide/performance/performance_benchmark/benchmark_server.md b/docs/source/developer_guide/performance/performance_benchmark/benchmark_server.md index b16615e..5ab8623 100644 --- a/docs/source/developer_guide/performance/performance_benchmark/benchmark_server.md +++ b/docs/source/developer_guide/performance/performance_benchmark/benchmark_server.md @@ -11,26 +11,30 @@ You can directly use vLLM's CLI benchmark. For more details, please refer to[vLL Server startup script reference ```bash -USE_ORI_ROPE=1 VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \ +python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ - --port xxxx \ - --model /xxxx/xxxx/model\ + --port 8000 \ + --model /xxxx/xxxx/model\ --gpu-memory-utilization 0.9 \ --trust-remote-code \ --max-model-len 32768 \ --tensor-parallel-size 1 \ --dtype float16 \ - --max_num_seqs 128 \ - --max_num_batched_tokens 32768 \ - --max-seq-len-to-capture 32768 \ - --block-size 128 \ --no-enable-prefix-caching \ --no-enable-chunked-prefill \ --distributed-executor-backend mp \ --served-model-name modelname \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' \ + --compilation-config '{"splitting_ops": ["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", +
"vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer"]}' \ + ``` ##### 1.2Execute test @@ -124,26 +128,30 @@ The following demonstrates the performance test of the Qwen3-8B in a single-card The first step is to start the server. The example script is shown below. ```bash -USE_ORI_ROPE=1 VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \ +python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ - --port xxxx \ - --model /xxxx/xxxx/Qwen3-8B\ + --port 8000 \ + --model /models/Qwen3-8B\ --gpu-memory-utilization 0.9 \ --trust-remote-code \ --max-model-len 32768 \ --tensor-parallel-size 1 \ --dtype float16 \ - --max_num_seqs 128 \ - --max_num_batched_tokens 32768 \ - --max-seq-len-to-capture 32768 \ - --block-size 128 \ --no-enable-prefix-caching \ --no-enable-chunked-prefill \ --distributed-executor-backend mp \ - --served-model-name Qwen3-8B \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' \ + --served-model-name Qwen3-8B-Instruct \ + --compilation-config '{"splitting_ops": ["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer"]}' \ + ``` ##### 2.2 Start EvalScope diff --git a/docs/source/developer_guide/performance/performance_benchmark/index.md b/docs/source/developer_guide/performance/performance_benchmark/index.md index 92dbb1d..e480bb5 100644 --- a/docs/source/developer_guide/performance/performance_benchmark/index.md +++ b/docs/source/developer_guide/performance/performance_benchmark/index.md @@ -7,5 +7,4 @@ This document details the performance testing methods for vllm-kunlun and the an :maxdepth: 1 benchmark_server benchmark_kernel 
-profiling ::: \ No newline at end of file diff --git a/docs/source/developer_guide/performance/performance_benchmark/profiling.md b/docs/source/developer_guide/performance/performance_benchmark/profiling.md deleted file mode 100644 index 81b705d..0000000 --- a/docs/source/developer_guide/performance/performance_benchmark/profiling.md +++ /dev/null @@ -1,418 +0,0 @@ -## Profiling - - - -### 🔧 Action Plan(Three Phases) -#### Phase 1️⃣: Multi-Device Log Redirection Configuration -##### Background -By default, kernel logs from all 8 XPU devices are interleaved and emitted to [stdout], resulting in: -- It becomes impossible to distinguish which log originates from which device. -- Timestamps become interleaved, making it difficult to analyze the temporal relationships. -- Single-device bottlenecks are masked by global aggregation. - -##### Solution -During model initialization, create separate log files for each device. -##### Code Explanation (embedded in qwen2.py) -```python -import os # ← Ensure this is imported at the top of the file -from vllm.distributed import get_tensor_model_parallel_rank # ← Import function to get the tensor model parallel rank - -class Qwen2Model(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = "", - decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer): - super().__init__() - - # ========== [Expert Solution] Kunlun XPU Multi-Device Log Redirection ========== - try: - # Step 1: Get the current XPU device's rank (0~7) - rank = get_tensor_model_parallel_rank() - - # Step 2: Create log directory (works with your get_kernel_time_ex.py) - log_dir = "./xpu_logs" - os.makedirs(log_dir, exist_ok=True) - - # Step 3: Generate a separate log file for each device - log_file = os.path.join(log_dir, f"rank_{rank}.log") - - # Step 4: Core operation – redirect file descriptors - # os.O_TRUNC: Clear previous logs on each run to avoid mixing outputs - fd = os.open(log_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o664) - 
os.dup2(fd, 1) # Redirect stdout → rank_X.log - os.dup2(fd, 2) # Redirect stderr → rank_X.log - os.close(fd) # Close original file descriptor; redirection persists - - # Optional: print a confirmation message (will go into rank_X.log) - print(f"[Qwen2Model Init] Rank {rank} log redirected to {log_file}") - - except Exception as e: - # Fallback mechanism: failure to redirect logs does not affect model loading - print(f"[WARNING] Failed to redirect log for rank: {e}", flush=True) - # ========== End of log redirection code ========== - -``` -##### ⚠️ Common Issues -**Q1**:Why not use Python's `logging` module? -**A**:The XPU runtime kernel logs are emitted from the C++ layer and cannot be captured by Python’s `logging` module. Redirection via low-level file descriptors is required. -**Q1**:Will logs be lost if the model fails to load?? -**A**:The `try-except` block ensures that if log redirection fails, it falls back to the default behavior without affecting model startup. - -#### Phase 2️⃣: Profiling Environment Activation -##### 🚀 vLLM Launch -```bash -unset XPU_DUMMY_EVENT -export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export XPU_USE_MOE_SORTED_THRES=1 -export XFT_USE_FAST_SWIGLU=1 -export XMLIR_CUDNN_ENABLED=1 -export XPU_USE_DEFAULT_CTX=1 -export XMLIR_FORCE_USE_XPU_GRAPH=1 -export XPU_USE_FAST_SWIGLU=1 -export VLLM_HOST_IP=$(hostname -i) -echo "VLLM_HOST_IP: $VLLM_HOST_IP" - -export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false - -export XPUAPI_DEBUG=0x1 # Enable kernel performance logging -export XPURT_DISPATCH_MODE=PROFILING # Activate profiling mode - -USE_ORI_ROPE=1 VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \ -      --host 0.0.0.0 \ -      --port 8000 \ -      --model /models/Qwen2.5-72B-Instruct \ -      --gpu-memory-utilization 0.9 \ -      --trust-remote-code \ -      --max-model-len 32768 \ -      --tensor-parallel-size 8 \ -      --dtype float16 \ -      --max_num_seqs 512 \ -      --max_num_batched_tokens 32768 \ -      --max-seq-len-to-capture 
32768 \ -      --block-size 128 \ -      --no-enable-prefix-caching \ -      --no-enable-chunked-prefill \ -      --distributed-executor-backend mp \ -      --served-model-name Qwen2.5-72B-Instruct \ -      --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' 2>&1 | tee output_p800.log - -``` - - -##### 🚀 Client Load Testing -```bash -#!/bin/bash - -# Define test combinations array (concurrency x input length x output length) -TEST_COMBINATIONS=( - "8x1024x1024" # Medium-low concurrency -) - -# Create result directory -RESULT_DIR="bench_$(date +%Y%m%d_%H%M)" -mkdir -p $RESULT_DIR - -# Summary results file -SUMMARY_FILE="$RESULT_DIR/summary_results.csv" -echo "num_prompts,input_len,output_len,throughput,latency_mean,latency_p50,latency_p90,latency_p99" >$SUMMARY_FILE - -# Progress counter -TOTAL_TESTS=${#TEST_COMBINATIONS[@]} -CURRENT_TEST=0 - -# Loop through different test combinations -for COMBINATION in "${TEST_COMBINATIONS[@]}"; do - # Parse combination parameters - NUM_PROMPTS=$(echo $COMBINATION | cut -d'x' -f1) - INPUT_LEN=$(echo $COMBINATION | cut -d'x' -f2) - OUTPUT_LEN=$(echo $COMBINATION | cut -d'x' -f3) - - # Update progress - CURRENT_TEST=$((CURRENT_TEST + 1)) - - echo "==========================================================" - echo "Test progress: $CURRENT_TEST/$TOTAL_TESTS ($(printf "%.1f" $(echo "$CURRENT_TEST/$TOTAL_TESTS*100" | bc -l))%)" - echo "Current test configuration: concurrency=$NUM_PROMPTS, input length=$INPUT_LEN, output length=$OUTPUT_LEN" - echo "==========================================================" - - OUTPUT_FILE="$RESULT_DIR/p800_${NUM_PROMPTS}_${INPUT_LEN}_${OUTPUT_LEN}.log" - - # Run benchmark - python3 -m vllm.entrypoints.cli.main bench serve \ - --host 127.0.0.1 \ - --port 8000 \ - --backend vllm \ - --model Qwen2.5-72B-Instruct \ - --dataset-name random \ - --num-prompts $NUM_PROMPTS \ - 
--random-input-len $INPUT_LEN \ - --random-output-len $OUTPUT_LEN \ - --tokenizer /ssd1/models/Qwen2.5-72B-Instruct \ - --ignore-eos 2>&1 | tee $OUTPUT_FILE - - # Wait 15 seconds to let the service recover - echo "Waiting 15 seconds before the next round..." - sleep 15 - - # Extract key performance metrics from output and append to summary file - THROUGHPUT=$(grep "Throughput" $OUTPUT_FILE | awk '{print $2}') - LATENCY_MEAN=$(grep "Mean latency" $OUTPUT_FILE | awk '{print $3}') - LATENCY_P50=$(grep "p50 latency" $OUTPUT_FILE | awk '{print $3}') - LATENCY_P90=$(grep "p90 latency" $OUTPUT_FILE | awk '{print $3}') - LATENCY_P99=$(grep "p99 latency" $OUTPUT_FILE | awk '{print $3}') - - echo "$NUM_PROMPTS,$INPUT_LEN,$OUTPUT_LEN,$THROUGHPUT,$LATENCY_MEAN,$LATENCY_P50,$LATENCY_P90,$LATENCY_P99" >>$SUMMARY_FILE -done - -# Output summary report -echo "==========================================================" -echo "Benchmark completed! Results saved in: $RESULT_DIR" -echo "==========================================================" - - -``` - -#### Phase 3️⃣: Log Analysis and Bottleneck Identification -```lua -xpu_logs/ -├─ rank_0.log -├─ rank_1.log -├─ rank_2.log -├─ rank_3.log -├─ rank_4.log -├─ rank_5.log -├─ rank_6.log -└─ rank_7.log - -``` -##### 🔍 Script Workflow (op_log.py) -**Input**:Raw Kernel Logs (Sample Format) -``` -[XPURT_PROF] void xblas_xpu3::fc_cdnn_infer 123456 ns -[XPURT_PROF] void kl3_all_reduce 987654 ns -``` -**Processing logic** -:::::{tab-set} -::::{tab-item} op_log.py - - -```python -""" -A better version of 'get_op_time.py', get more level dump and support kl3. 
-  -Usage: python3 get_kernel_time_ex.py --help -""" -  -import os -import sys -import re -  -unit_factors = [0.9, 1.3, 1.45] # kunlun1, kunlun2, kunlun3 -patterns = ["\[XPURT_PROF\] (\S+)\s+\S+\s+(\S+) ns", "\[XPURT_PROF\] (\S+)\s+(\S+)\s+\S+ ns"] -tab_space_num = int(4) -  -def get_total_time(res): -    total_time = 0.0 -    for i in res.values(): -        total_time += i -    return  total_time -  -def print_info_op(res, cnt, unit, op): -    total_time = get_total_time(res) -    total_cnt = 0 -    # print detailed op time -    lis=sorted(res.items(), key=lambda d:d[1], reverse=True) -    if sys.version_info.major == 2: -        import commands -        for i in range(len(lis)): -            (status, cmd_output) = commands.getstatusoutput("c++filt {}".format(lis[i][0])) -            if status == 0: -                formt_type = (cmd_output.split('('))[0] -            total_cnt += cnt[lis[i][0]] -    elif sys.version_info.major == 3: -        import subprocess -        for i in range(len(lis)): -            (status, cmd_output) = subprocess.getstatusoutput("c++filt {}".format(lis[i][0])) -            if status == 0: -                formt_type = (cmd_output.split('('))[0] -            total_cnt += cnt[lis[i][0]] -    print(f"{op} {total_time / unit} {total_cnt}") -  -def print_info_kernel(res, cnt, unit): -    total_time = get_total_time(res) -    total_cnt = 0 -    print("Total time(ms) is {}".format(total_time / unit)) -    # print detailed op time -    lis=sorted(res.items(), key=lambda d:d[1], reverse=True) -    if sys.version_info.major == 2: -        print("{:<90}{:<10}{:<15}{:<15}".format("Op type", "count", "time(ms)", "%")) -        import commands -        for i in range(len(lis)): -            (status, cmd_output) = commands.getstatusoutput("c++filt {}".format(lis[i][0])) -            if status == 0: -                formt_type = (cmd_output.split('('))[0] -            print("{:<90}{:<10}{:<15}{:<15.5}".format(formt_type, cnt[lis[i][0]], lis[i][1] / 
unit, \ -                lis[i][1] / total_time * 100)) -            total_cnt += cnt[lis[i][0]] -    elif sys.version_info.major == 3: -        print("{:<90}{:<10}{:<20}{:<20}".format("Op type", "count", "time(ms)", "%")) -        import subprocess -        for i in range(len(lis)): -            (status, cmd_output) = subprocess.getstatusoutput("c++filt {}".format(lis[i][0])) -            if status == 0: -                formt_type = (cmd_output.split('('))[0] -            print("{:<150}{:<10}{:<25}{:<20.5}".format(formt_type, cnt[lis[i][0]], lis[i][1] / unit, \ -                lis[i][1] / total_time * 100)) -            total_cnt += cnt[lis[i][0]] -  -    print("Total count is {}".format(total_cnt)) -  -def count_head_spaces(s: str) -> int: -    -    count = 0 -    for char in s: -        if char == ' ': -            count += 1 -        else: -            break -    return count -  -def process_line(lines, pattern1, unit_factor, dump_level): -    """ process a line in a file with profiling info -  -    Args: -        unit_factor: A factor differentiated by KUNLUN1 and KUNLUN2 -  -    """ -    res = {} -    cnt = {} -    op = "init_op" -    unit = unit_factor * 1000 * 1000 # ns -> ms -    wait_next_one = False -    for i in range(len(lines)): -        cur_line = lines[i] -        if "gtest_" in cur_line: -            cur_level = count_head_spaces(cur_line) / tab_space_num -            if cur_level == dump_level: -                wait_next_one = False -                print_info_op(res, cnt, unit, op) -                # clear buf -                res = {} -                cnt = {} -                op = cur_line.lstrip().rstrip() -            elif cur_level < dump_level: -                wait_next_one = True -                # skip record kernel time untime next one -                continue -        if wait_next_one: -            # skip record kernel time -            continue -        match = re.match(pattern1, lines[i]) -        if match: -            op_type = 
match.group(1) -            op_time = match.group(2) -            if op_type in res: -                res[op_type] += float(op_time) -                cnt[op_type] += 1 -            else: -                res[op_type] = float(op_time) -                cnt[op_type] = 1 -  -    # get left total time -    if dump_level == -1: -        print_info_kernel(res, cnt, unit) -    else: -        print_info_op(res, cnt, unit, op) -    return res -  -def process_file(file_name, pattern2, unit_factor, dump_level = -1): -    """ Process a file line by line -  -    Iteratively process each line in the target file. -  -    """ -  -    with open(file_name, "r") as f: -        lines = f.readlines() -        f1_res_list = process_line(lines, pattern2, unit_factor, dump_level) -  -if __name__ == '__main__': -    import argparse -  - -    parser = argparse.ArgumentParser() -  - -    group = parser.add_mutually_exclusive_group() -    group.add_argument('-xpu1', action='store_true', help='指定为 xpu1') -    group.add_argument('-xpu2', action='store_true', help='指定为 xpu2') -    group.add_argument('-xpu3', action='store_true', help='指定为 xpu3') -    parser.add_argument('--level', type=int, default=-1, help='指定 dump 缩进级别(默认为 -1)') - -    parser.add_argument('filename', help='要处理的文件名') -  - -    args = parser.parse_args() -  - -    filename = args.filename -    xpu_version = 0 -    if args.xpu2: -        xpu_version = 1 -    if args.xpu3: -        xpu_version = 2 -    dump_level = args.level -    print(f'Filename: {filename}') -    print(f'-xpu option: {xpu_version}') -    print(f'--level option: {dump_level}') -  -    unit_factor = unit_factors[xpu_version] -    pattern_idx = 0 -    if xpu_version > 0: -        pattern_idx = 1 -    process_file(filename, patterns[pattern_idx], unit_factor, dump_level) -  -``` - -:::: - -::::{tab-item} op_log.sh - - - -```bash - -for i in {0..7}; do -    python op_log.py -xpu3 xpu_logs/rank_${i}.log > analysis_rank${i}.log -    echo "Rank ${i} 分析完成" -done - - -for 
i in {0..7}; do -    echo "=== Rank $i ==="  -    head -n 6 analysis_rank${i}.log | tail -n 5 -done -``` -:::: -::::: -##### 📈 Output Example (analysis_rank0.log) -``` -Filename: xpu_logs/rank_0.log --xpu option: 2 ---level option: -1 -Total time(ms) is 53742.29571862069 -Op type                                                                                   count     time(ms)            %                    -void xblas_xpu3::fc_cdnn_infer                                                     661569    22736.262780689656       42.306               -void kl3_all_reduce                                                                                                                          176134    14782.525712413793       27.506               -void kl3_all_reduce_butterfly                                                                                                                164864    4197.28395862069         7.81            -``` -##### 🚨 Troubleshooting Guide -|Symptom|Cause|Solution| -|-|-|-| -|`xpu_logs` directory is empty|XPUAPI_DEBUG not enabled|Verify that the environment variable is correctly set| -All 8 log files have identical content|Multi-process backend not activated|Ensure `--distributed-executor-backend` mp is specified| -|Throughput drops >15%|Profiling overhead too high|Enable profiling only during analysis; disable in production| \ No newline at end of file diff --git a/docs/source/faqs.md b/docs/source/faqs.md index a6169b7..7d6365c 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -2,7 +2,7 @@ ## Version Specific FAQs -- [[v0.10.1.1] FAQ & Feedback] +- [[v0.11.0] FAQ & Feedback] ## General FAQs @@ -20,12 +20,13 @@ We will support the kunlun4 M100 platform in early 2026. ### 2. How to get our docker containers? -**base**:`docker pull wjie520/vllm_kunlun:v0.0.1`. +**base**:`docker pull iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32`. +**full**:`docker pull wjie520/vllm_kunlun:v0.0.1`. ### 3. 
How vllm-kunlun work with vLLM? -vllm-kunlun is a hardware plugin for vLLM. Basically, the version of vllm-kunlun is the same as the version of vllm. For example, if you use vllm 0.10.1.1, you should use vllm-kunlun 0.10.1.1 as well. For main branch, we will make sure `vllm-kunlun` and `vllm` are compatible by each commit. +vllm-kunlun is a hardware plugin for vLLM. Basically, the version of vllm-kunlun is the same as the version of vllm. For example, if you use vllm 0.11.0, you should use vllm-kunlun 0.11.0 as well. For main branch, we will make sure `vllm-kunlun` and `vllm` are compatible by each commit. ### 4. How to handle the out-of-memory issue? diff --git a/docs/source/index.md b/docs/source/index.md index 5146c83..6818df8 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -16,9 +16,9 @@

-Star -Watch -Fork +Star +Watch +Fork

::: diff --git a/docs/source/installation.md b/docs/source/installation.md index 0bcde7d..c8e13b7 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -11,7 +11,7 @@ This document describes how to install vllm-kunlun manually. - vLLM (same version as vllm-kunlun) ## Setup environment using container -We provide a clean, minimal base image for your use`wjie520/vllm_kunlun:v0.0.1`.You can pull it using the `docker pull` command. +We provide a clean, minimal base image for your use`iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32`.You can pull it using the `docker pull` command. ### Container startup script :::::{tab-set} @@ -31,7 +31,7 @@ if [ $XPU_NUM -gt 0 ]; then done DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl" fi -export build_image="wjie520/vllm_kunlun:v0.0.1" +export build_image="iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32" docker run -itd ${DOCKER_DEVICE_CONFIG} \ --net=host \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -46,16 +46,16 @@ docker run -itd ${DOCKER_DEVICE_CONFIG} \ :::: ::::: ## Install vLLM-kunlun -### Install vLLM 0.10.1.1 +### Install vLLM 0.11.0 ``` conda activate python310_torch25_cuda -pip install vllm==0.10.1.1 --no-build-isolation --no-deps +pip install vllm==0.11.0 ``` ### Build and Install Navigate to the vllm-kunlun directory and build the package: ``` -git clone https://github.com/baidu/vLLM-Kunlun # TODO: replace with Github Url to install vllm-kunlun +git clone xxxx # TODO: replace with Github Url to install vllm-kunlun cd vllm-kunlun @@ -71,28 +71,33 @@ Copy the eval_frame.py patch: ``` cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/python310_torch25_cuda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py ``` -## Update xpytorch +## Install the KL3-customized build of PyTorch ``` -wget https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/0830/xpytorch-cp310-torch251-ubuntu2004-x64.run - -bash 
xpytorch-cp310-torch251-ubuntu2004-x64.run +wget https://klx-sdk-release-public.su.bcebos.com/xpytorch/release/3.3.2.7/xpytorch-cp310-torch251-ubuntu2004-x64.run && bash xpytorch-cp310-torch251-ubuntu2004-x64.run ``` ## Install custom ops ``` -pip install \ -https://xtorch_ops - -pip install \ -https://xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl +pip uninstall xtorch_ops -y && pip install \ +"https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/xtorch_ops-0.1.2028%2B1baf1b15-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-10-31T10%3A38%3A24Z%2F-1%2Fhost%2Faa1969b70a4a97c407d69614a5d5a3e26ea07286d13f0a2ab8daccc288152903" ``` +## Install the KLX3 custom Triton build +``` +pip install \ +"https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKxPW2jzoJUuFZmI19s3yry%2F2025-11-05T02%3A47%3A29Z%2F-1%2Fhost%2Fd8c95dbd06187a3140ca3e681e00c6941c30e14bb1d4112a0c8bc3c93e5c9c3f" +``` +## Install the AIAK custom ops library +``` +pip install \ +"https://cce-ai-models.bj.bcebos.com/v1/chenyili/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKxPW2jzoJUuFZmI19s3yry%2F2025-11-18T01%3A56%3A21Z%2F-1%2Fhost%2F28b57cbc5dc62ac1bf946e74146b3ea4952d2ffff448617f0303980dcaf6cb49" +``` ## Quick Start ### Set up the environment ``` -chmod +x /workspace/vllm-kunlun/setup_env.sh && source /workspace/vllm-kunlun/setup_env.sh +chmod +x /workspace/baidu/hac-aiacc/vllm-kunlun/setup_env.sh && source /workspace/baidu/hac-aiacc/vllm-kunlun/setup_env.sh ``` ### Run the server @@ -107,7 +112,7 @@ chmod +x /workspace/vllm-kunlun/setup_env.sh && source /workspace/vllm-kunlun/se python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ --port 8356 \ - --model /models/Qwen3-8B\ + --model models/Qwen3-VL-30B-A3B-Instruct \ --gpu-memory-utilization 0.9 \ --trust-remote-code \ --max-model-len 32768 \ @@ -115,15 +120,22 @@ 
python -m vllm.entrypoints.openai.api_server \ --dtype float16 \ --max_num_seqs 128 \ --max_num_batched_tokens 32768 \ - --max-seq-len-to-capture 32768 \ --block-size 128 \ --no-enable-prefix-caching \ --no-enable-chunked-prefill \ --distributed-executor-backend mp \ - --served-model-name Qwen3-8B \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' \ + --served-model-name Qwen3-VL-30B-A3B-Instruct \ + --compilation-config '{"splitting_ops": ["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer"]}' \ + ``` :::: ::::: \ No newline at end of file diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/contributors.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/contributors.po new file mode 100644 index 0000000..3c1b549 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/contributors.po @@ -0,0 +1,1328 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/community/contributors.md:1 +msgid "Maintainers and contributors" +msgstr "维护者和贡献者" + +#: ../../source/community/contributors.md:3 +msgid "Maintainers" +msgstr "维护者" + +#~ msgid "Name" +#~ msgstr "名称" + +#~ msgid "Github ID" +#~ msgstr "Github 账号" + +#~ msgid "Date" +#~ msgstr "日期" + +#~ msgid "Xiyuan Wang" +#~ msgstr "Xiyuan Wang" + +#~ msgid "[@wangxiyuan](https://github.com/wangxiyuan)" +#~ msgstr "[@wangxiyuan](https://github.com/wangxiyuan)" + +#~ msgid "2025/01" +#~ msgstr "2025/01" + +#~ msgid "Yikun Jiang" +#~ msgstr "Yikun Jiang" + +#~ msgid "[@Yikun](https://github.com/Yikun)" +#~ msgstr "[@Yikun](https://github.com/Yikun)" + +#~ msgid "2025/02" +#~ msgstr "2025/02" + +#~ msgid "Yi Gan" +#~ msgstr "Yi Gan" + +#~ msgid "[@ganyi1996ppo](https://github.com/ganyi1996ppo)" +#~ msgstr "[@ganyi1996ppo](https://github.com/ganyi1996ppo)" + +#~ msgid "Shoujian Zheng" +#~ msgstr "Shoujian Zheng" + +#~ msgid "[@jianzs](https://github.com/jianzs)" +#~ msgstr "[@jianzs](https://github.com/jianzs)" + +#~ msgid "2025/06" +#~ msgstr "2025/06" + +#~ msgid "Contributors" +#~ msgstr "贡献者" + +#~ msgid "" +#~ "vLLM Kunlun every release would not " +#~ "have been possible without the following" +#~ " contributors:" +#~ msgstr "每个 vLLM Kunlun 版本的发布都离不开以下贡献者:" + +#~ msgid "Updated on 2025-06-10:" +#~ msgstr "更新于 2025-06-10:" + +#~ msgid "Number" +#~ msgstr "数字" + +#~ msgid "Contributor" +#~ msgstr "贡献者" + +#~ msgid "Commit ID" +#~ msgstr "提交 ID" + +#~ msgid "83" +#~ msgstr "83" + +#~ msgid 
"[@ZhengWG](https://github.com/)" +#~ msgstr "[@ZhengWG](https://github.com/)" + +#~ msgid "2025/7/7" +#~ msgstr "2025/7/7" + +#~ msgid "" +#~ "[3a469de](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/9c886d0a1f0fc011692090b0395d734c83a469de)" +#~ msgstr "" +#~ "[3a469de](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/9c886d0a1f0fc011692090b0395d734c83a469de)" + +#~ msgid "82" +#~ msgstr "82" + +#~ msgid "[@wm901115nwpu](https://github.com/)" +#~ msgstr "[@wm901115nwpu](https://github.com/)" + +#~ msgid "" +#~ "[a2a47d4](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/f08c4f15a27f0f27132f4ca7a0c226bf0a2a47d4)" +#~ msgstr "" +#~ "[a2a47d4](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/f08c4f15a27f0f27132f4ca7a0c226bf0a2a47d4)" + +#~ msgid "81" +#~ msgstr "81" + +#~ msgid "[@Agonixiaoxiao](https://github.com/)" +#~ msgstr "[@Agonixiaoxiao](https://github.com/)" + +#~ msgid "2025/7/2" +#~ msgstr "2025/7/2" + +#~ msgid "" +#~ "[6f84576](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7fc1a984890bd930f670deedcb2dda3a46f84576)" +#~ msgstr "" +#~ "[6f84576](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7fc1a984890bd930f670deedcb2dda3a46f84576)" + +#~ msgid "80" +#~ msgstr "80" + +#~ msgid "[@zhanghw0354](https://github.com/zhanghw0354)" +#~ msgstr "[@zhanghw0354](https://github.com/zhanghw0354)" + +#~ msgid "" +#~ "[d3df9a5](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/9fb3d558e5b57a3c97ee5e11b9f5dba6ad3df9a5)" +#~ msgstr "" +#~ "[d3df9a5](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/9fb3d558e5b57a3c97ee5e11b9f5dba6ad3df9a5)" + +#~ msgid "79" +#~ msgstr "79" + +#~ msgid "[@GDzhu01](https://github.com/GDzhu01)" +#~ msgstr "[@GDzhu01](https://github.com/GDzhu01)" + +#~ msgid "2025/6/28" +#~ msgstr "2025/6/28" + +#~ msgid "" +#~ "[de256ac](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/b308a7a25897b88d4a23a9e3d583f4ec6de256ac)" +#~ msgstr "" +#~ 
"[de256ac](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/b308a7a25897b88d4a23a9e3d583f4ec6de256ac)" + +#~ msgid "78" +#~ msgstr "78" + +#~ msgid "[@leo-pony](https://github.com/leo-pony)" +#~ msgstr "[@leo-pony](https://github.com/leo-pony)" + +#~ msgid "2025/6/26" +#~ msgstr "2025/6/26" + +#~ msgid "" +#~ "[3f2a5f2](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/10253449120307e3b45f99d82218ba53e3f2a5f2)" +#~ msgstr "" +#~ "[3f2a5f2](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/10253449120307e3b45f99d82218ba53e3f2a5f2)" + +#~ msgid "77" +#~ msgstr "77" + +#~ msgid "[@zeshengzong](https://github.com/zeshengzong)" +#~ msgstr "[@zeshengzong](https://github.com/zeshengzong)" + +#~ msgid "" +#~ "[3ee25aa](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/192dbbcc6e244a8471d3c00033dc637233ee25aa)" +#~ msgstr "" +#~ "[3ee25aa](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/192dbbcc6e244a8471d3c00033dc637233ee25aa)" + +#~ msgid "76" +#~ msgstr "76" + +#~ msgid "[@sharonyunyun](https://github.com/sharonyunyun)" +#~ msgstr "[@sharonyunyun](https://github.com/sharonyunyun)" + +#~ msgid "2025/6/25" +#~ msgstr "2025/6/25" + +#~ msgid "" +#~ "[2dd8666](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/941269a6c5bbc79f6c1b6abd4680dc5802dd8666)" +#~ msgstr "" +#~ "[2dd8666](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/941269a6c5bbc79f6c1b6abd4680dc5802dd8666)" + +#~ msgid "75" +#~ msgstr "75" + +#~ msgid "[@Pr0Wh1teGivee](https://github.com/Pr0Wh1teGivee)" +#~ msgstr "[@Pr0Wh1teGivee](https://github.com/Pr0Wh1teGivee)" + +#~ msgid "" +#~ "[c65dd40](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2fda60464c287fe456b4a2f27e63996edc65dd40)" +#~ msgstr "" +#~ "[c65dd40](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2fda60464c287fe456b4a2f27e63996edc65dd40)" + +#~ msgid "74" +#~ msgstr "74" + +#~ msgid "[@xleoken](https://github.com/xleoken)" +#~ msgstr 
"[@xleoken](https://github.com/xleoken)" + +#~ msgid "2025/6/23" +#~ msgstr "2025/6/23" + +#~ msgid "" +#~ "[c604de0](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/4447e53d7ad5edcda978ca6b0a3a26a73c604de0)" +#~ msgstr "" +#~ "[c604de0](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/4447e53d7ad5edcda978ca6b0a3a26a73c604de0)" + +#~ msgid "73" +#~ msgstr "73" + +#~ msgid "[@lyj-jjj](https://github.com/lyj-jjj)" +#~ msgstr "[@lyj-jjj](https://github.com/lyj-jjj)" + +#~ msgid "" +#~ "[5cbd74e](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/5177bef87a21331dcca11159d3d1438075cbd74e)" +#~ msgstr "" +#~ "[5cbd74e](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/5177bef87a21331dcca11159d3d1438075cbd74e)" + +#~ msgid "72" +#~ msgstr "72" + +#~ msgid "[@farawayboat](https://github.com/farawayboat)" +#~ msgstr "[@farawayboat](https://github.com/farawayboat)" + +#~ msgid "2025/6/21" +#~ msgstr "2025/6/21" + +#~ msgid "" +#~ "[bc7d392](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/097e7149f75c0806774bc68207f0f6270bc7d392)" +#~ msgstr "" +#~ "[bc7d392](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/097e7149f75c0806774bc68207f0f6270bc7d392)" + +#~ msgid "71" +#~ msgstr "71" + +#~ msgid "[@yuancaoyaoHW](https://github.com/yuancaoyaoHW)" +#~ msgstr "[@yuancaoyaoHW](https://github.com/yuancaoyaoHW)" + +#~ msgid "2025/6/20" +#~ msgstr "2025/6/20" + +#~ msgid "" +#~ "[7aa0b94](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/00ae250f3ced68317bc91c93dc1f1a0977aa0b94)" +#~ msgstr "" +#~ "[7aa0b94](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/00ae250f3ced68317bc91c93dc1f1a0977aa0b94)" + +#~ msgid "70" +#~ msgstr "70" + +#~ msgid "[@songshanhu07](https://github.com/songshanhu07)" +#~ msgstr "[@songshanhu07](https://github.com/songshanhu07)" + +#~ msgid "2025/6/18" +#~ msgstr "2025/6/18" + +#~ msgid "" +#~ "[5e1de1f](https://github.com/vllm-project/vllm-" +#~ 
"kunlun/commit/2a70dbbdb8f55002de3313e17dfd595e1de1f)" +#~ msgstr "" +#~ "[5e1de1f](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2a70dbbdb8f55002de3313e17dfd595e1de1f)" + +#~ msgid "69" +#~ msgstr "69" + +#~ msgid "[@wangyanhui-cmss](https://github.com/wangyanhui-cmss)" +#~ msgstr "[@wangyanhui-cmss](https://github.com/wangyanhui-cmss)" + +#~ msgid "2025/6/12" +#~ msgstr "2025/6/12" + +#~ msgid "" +#~ "[40c9e88](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2a5fb4014b863cee6abc3009f5bc5340c9e88)" +#~ msgstr "" +#~ "[40c9e88](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2a5fb4014b863cee6abc3009f5bc5340c9e88)" + +#~ msgid "68" +#~ msgstr "68" + +#~ msgid "[@chenwaner](https://github.com/chenwaner)" +#~ msgstr "[@chenwaner](https://github.com/chenwaner)" + +#~ msgid "2025/6/11" +#~ msgstr "2025/6/11" + +#~ msgid "" +#~ "[c696169](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e46dc142bf1180453c64226d76854fc1ec696169)" +#~ msgstr "" +#~ "[c696169](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e46dc142bf1180453c64226d76854fc1ec696169)" + +#~ msgid "67" +#~ msgstr "67" + +#~ msgid "[@yzim](https://github.com/yzim)" +#~ msgstr "[@yzim](https://github.com/yzim)" + +#~ msgid "" +#~ "[aaf701b](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/4153a5091b698c2270d160409e7fee73baaf701b)" +#~ msgstr "" +#~ "[aaf701b](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/4153a5091b698c2270d160409e7fee73baaf701b)" + +#~ msgid "66" +#~ msgstr "66" + +#~ msgid "[@Yuxiao-Xu](https://github.com/Yuxiao-Xu)" +#~ msgstr "[@Yuxiao-Xu](https://github.com/Yuxiao-Xu)" + +#~ msgid "2025/6/9" +#~ msgstr "2025/6/9" + +#~ msgid "" +#~ "[6b853f1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505)" +#~ msgstr "" +#~ "[6b853f1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505)" + +#~ msgid "65" +#~ msgstr "65" + +#~ msgid 
"[@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU)" +#~ msgstr "[@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU)" + +#~ msgid "2025/6/7" +#~ msgstr "2025/6/7" + +#~ msgid "" +#~ "[20dedba](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193)" +#~ msgstr "" +#~ "[20dedba](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193)" + +#~ msgid "64" +#~ msgstr "64" + +#~ msgid "[@zxdukki](https://github.com/zxdukki)" +#~ msgstr "[@zxdukki](https://github.com/zxdukki)" + +#~ msgid "" +#~ "[87ebaef](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/87ebaef4e4e519988f27a6aa378f614642202ecf)" +#~ msgstr "" +#~ "[87ebaef](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/87ebaef4e4e519988f27a6aa378f614642202ecf)" + +#~ msgid "63" +#~ msgstr "63" + +#~ msgid "[@sdmyzlp](https://github.com/sdmyzlp)" +#~ msgstr "[@sdmyzlp](https://github.com/sdmyzlp)" + +#~ msgid "" +#~ "[3640c60](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7)" +#~ msgstr "" +#~ "[3640c60](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7)" + +#~ msgid "62" +#~ msgstr "62" + +#~ msgid "[@weijinqian0](https://github.com/weijinqian0)" +#~ msgstr "[@weijinqian0](https://github.com/weijinqian0)" + +#~ msgid "" +#~ "[e9ada68](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b)" +#~ msgstr "" +#~ "[e9ada68](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b)" + +#~ msgid "61" +#~ msgstr "61" + +#~ msgid "[@hahazhky](https://github.com/hahazhky)" +#~ msgstr "[@hahazhky](https://github.com/hahazhky)" + +#~ msgid "2025/6/6" +#~ msgstr "2025/6/6" + +#~ msgid "" +#~ "[0b12c2a](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/0b12c2acf7d9fd192beebebf662298067d9a5435)" +#~ msgstr "" +#~ 
"[0b12c2a](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/0b12c2acf7d9fd192beebebf662298067d9a5435)" + +#~ msgid "60" +#~ msgstr "60" + +#~ msgid "[@depeng1994](https://github.com/depeng1994)" +#~ msgstr "[@depeng1994](https://github.com/depeng1994)" + +#~ msgid "" +#~ "[6b094a2](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f)" +#~ msgstr "" +#~ "[6b094a2](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f)" + +#~ msgid "59" +#~ msgstr "59" + +#~ msgid "[@David9857](https://github.com/David9857)" +#~ msgstr "[@David9857](https://github.com/David9857)" + +#~ msgid "2025/6/5" +#~ msgstr "2025/6/5" + +#~ msgid "" +#~ "[78431b3](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/78431b34694dfa3c8f54ed7cc626660318557927)" +#~ msgstr "" +#~ "[78431b3](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/78431b34694dfa3c8f54ed7cc626660318557927)" + +#~ msgid "58" +#~ msgstr "58" + +#~ msgid "[@momo609](https://github.com/momo609)" +#~ msgstr "[@momo609](https://github.com/momo609)" + +#~ msgid "" +#~ "[908a851](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/908a851a776cfd9051cc062119e6ec481561c6f7)" +#~ msgstr "" +#~ "[908a851](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/908a851a776cfd9051cc062119e6ec481561c6f7)" + +#~ msgid "57" +#~ msgstr "57" + +#~ msgid "[@zhangxinyuehfad](https://github.com/zhangxinyuehfad)" +#~ msgstr "[@zhangxinyuehfad](https://github.com/zhangxinyuehfad)" + +#~ msgid "" +#~ "[7737aaa](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5)" +#~ msgstr "" +#~ "[7737aaa](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5)" + +#~ msgid "56" +#~ msgstr "56" + +#~ msgid "[@NINGBENZHE](https://github.com/NINGBENZHE)" +#~ msgstr "[@NINGBENZHE](https://github.com/NINGBENZHE)" + +#~ msgid "2025/6/3" +#~ msgstr "2025/6/3" 
+ +#~ msgid "" +#~ "[6ec64a3](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6ec64a3f9686df65b5a23a41aa301e669db19099)" +#~ msgstr "" +#~ "[6ec64a3](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6ec64a3f9686df65b5a23a41aa301e669db19099)" + +#~ msgid "55" +#~ msgstr "55" + +#~ msgid "[@XWFAlone](https://github.com/XWFAlone)" +#~ msgstr "[@XWFAlone](https://github.com/XWFAlone)" + +#~ msgid "2025/5/30" +#~ msgstr "2025/5/30" + +#~ msgid "" +#~ "[3442fbd](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e)" +#~ msgstr "" +#~ "[3442fbd](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e)" + +#~ msgid "54" +#~ msgstr "54" + +#~ msgid "[@YisongJiang](https://github.com/YisongJiang)" +#~ msgstr "[@YisongJiang](https://github.com/YisongJiang)" + +#~ msgid "2025/5/29" +#~ msgstr "2025/5/29" + +#~ msgid "" +#~ "[90afaf6](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/90afaf6306f680307462becf3c78585737579851)" +#~ msgstr "" +#~ "[90afaf6](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/90afaf6306f680307462becf3c78585737579851)" + +#~ msgid "53" +#~ msgstr "53" + +#~ msgid "[@ponix-j](https://github.com/ponix-j)" +#~ msgstr "[@ponix-j](https://github.com/ponix-j)" + +#~ msgid "2025/5/23" +#~ msgstr "2025/5/23" + +#~ msgid "" +#~ "[df58fb8](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73)" +#~ msgstr "" +#~ "[df58fb8](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73)" + +#~ msgid "52" +#~ msgstr "52" + +#~ msgid "[@ttanzhiqiang](https://github.com/ttanzhiqiang)" +#~ msgstr "[@ttanzhiqiang](https://github.com/ttanzhiqiang)" + +#~ msgid "" +#~ "[dc6172e](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40)" +#~ msgstr "" +#~ "[dc6172e](https://github.com/vllm-project/vllm-" +#~ 
"kunlun/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40)" + +#~ msgid "51" +#~ msgstr "51" + +#~ msgid "[@yangpuPKU](https://github.com/yangpuPKU)" +#~ msgstr "[@yangpuPKU](https://github.com/yangpuPKU)" + +#~ msgid "" +#~ "[46df67a](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1)" +#~ msgstr "" +#~ "[46df67a](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1)" + +#~ msgid "50" +#~ msgstr "50" + +#~ msgid "[@wonderful199082](https://github.com/wonderful199082)" +#~ msgstr "[@wonderful199082](https://github.com/wonderful199082)" + +#~ msgid "2025/5/20" +#~ msgstr "2025/5/20" + +#~ msgid "" +#~ "[5cf9ff1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763)" +#~ msgstr "" +#~ "[5cf9ff1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763)" + +#~ msgid "49" +#~ msgstr "49" + +#~ msgid "[@22dimensions](https://github.com/22dimensions)" +#~ msgstr "[@22dimensions](https://github.com/22dimensions)" + +#~ msgid "2025/5/17" +#~ msgstr "2025/5/17" + +#~ msgid "" +#~ "[a8730e7](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5)" +#~ msgstr "" +#~ "[a8730e7](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5)" + +#~ msgid "48" +#~ msgstr "48" + +#~ msgid "[@cxcxflying](https://github.com/cxcxflying)" +#~ msgstr "[@cxcxflying](https://github.com/cxcxflying)" + +#~ msgid "2025/5/13" +#~ msgstr "2025/5/13" + +#~ msgid "" +#~ "[e564470](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e56447033889ca95df512208cab22ef832bfdf07)" +#~ msgstr "" +#~ "[e564470](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e56447033889ca95df512208cab22ef832bfdf07)" + +#~ msgid "47" +#~ msgstr "47" + +#~ msgid "[@NeverRaR](https://github.com/NeverRaR)" +#~ msgstr 
"[@NeverRaR](https://github.com/NeverRaR)" + +#~ msgid "2025/5/12" +#~ msgstr "2025/5/12" + +#~ msgid "" +#~ "[efabd72](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/efabd722eb757e49aa309c173bbec91ca8c4ced1)" +#~ msgstr "" +#~ "[efabd72](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/efabd722eb757e49aa309c173bbec91ca8c4ced1)" + +#~ msgid "46" +#~ msgstr "46" + +#~ msgid "[@chris668899](https://github.com/chris668899)" +#~ msgstr "[@chris668899](https://github.com/chris668899)" + +#~ msgid "2025/5/8" +#~ msgstr "2025/5/8" + +#~ msgid "" +#~ "[6c02088](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6c020883a8332b5c519f4f6502733edd9b391c2b)" +#~ msgstr "" +#~ "[6c02088](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6c020883a8332b5c519f4f6502733edd9b391c2b)" + +#~ msgid "45" +#~ msgstr "45" + +#~ msgid "[@sunbaosong](https://github.com/sunbaosong)" +#~ msgstr "[@sunbaosong](https://github.com/sunbaosong)" + +#~ msgid "2025/5/6" +#~ msgstr "2025/5/6" + +#~ msgid "" +#~ "[d6bfae8](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d6bfae8eeebedf677b643b712d367a3a69c9cce4)" +#~ msgstr "" +#~ "[d6bfae8](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d6bfae8eeebedf677b643b712d367a3a69c9cce4)" + +#~ msgid "44" +#~ msgstr "44" + +#~ msgid "[@ApsarasX](https://github.com/ApsarasX)" +#~ msgstr "[@ApsarasX](https://github.com/ApsarasX)" + +#~ msgid "2025/4/29" +#~ msgstr "2025/4/29" + +#~ msgid "" +#~ "[87975fa](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/87975fa058fe3f90d204ded42a08989a8dcb413e)" +#~ msgstr "" +#~ "[87975fa](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/87975fa058fe3f90d204ded42a08989a8dcb413e)" + +#~ msgid "43" +#~ msgstr "43" + +#~ msgid "[@zouyida2052](https://github.com/zouyida2052)" +#~ msgstr "[@zouyida2052](https://github.com/zouyida2052)" + +#~ msgid "2025/4/28" +#~ msgstr "2025/4/28" + +#~ msgid "" +#~ "[b9528e6](https://github.com/vllm-project/vllm-" +#~ 
"kunlun/commit/b9528e6ecdc417cf444e55a0ce4a2bafdef0ea3b)" +#~ msgstr "" +#~ "[b9528e6](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/b9528e6ecdc417cf444e55a0ce4a2bafdef0ea3b)" + +#~ msgid "42" +#~ msgstr "42" + +#~ msgid "[@ZhengJun9](https://github.com/ZhengJun9)" +#~ msgstr "[@ZhengJun9](https://github.com/ZhengJun9)" + +#~ msgid "" +#~ "[1791113](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/17911138c90d78a76bd691e9dcb56763db35b19f)" +#~ msgstr "" +#~ "[1791113](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/17911138c90d78a76bd691e9dcb56763db35b19f)" + +#~ msgid "41" +#~ msgstr "41" + +#~ msgid "[@linfeng-yuan](https://github.com/linfeng-yuan)" +#~ msgstr "[@linfeng-yuan](https://github.com/linfeng-yuan)" + +#~ msgid "" +#~ "[2204e4d](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2204e4d08f8e10cf9c30154a14eaa5ca956c2acd)" +#~ msgstr "" +#~ "[2204e4d](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2204e4d08f8e10cf9c30154a14eaa5ca956c2acd)" + +#~ msgid "40" +#~ msgstr "40" + +#~ msgid "2025/4/27" +#~ msgstr "2025/4/27" + +#~ msgid "" +#~ "[fa4a5d9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/fa4a5d980e8845a88b9162cf169f0a5ab230f8a5)" +#~ msgstr "" +#~ "[fa4a5d9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/fa4a5d980e8845a88b9162cf169f0a5ab230f8a5)" + +#~ msgid "39" +#~ msgstr "39" + +#~ msgid "[@fakeYan](https://github.com/fakeYan)" +#~ msgstr "[@fakeYan](https://github.com/fakeYan)" + +#~ msgid "2025/4/23" +#~ msgstr "2025/4/23" + +#~ msgid "" +#~ "[05bdcbe](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/05bdcbeae47c7fcb9b1c30cad059abf1d40b5421)" +#~ msgstr "" +#~ "[05bdcbe](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/05bdcbeae47c7fcb9b1c30cad059abf1d40b5421)" + +#~ msgid "38" +#~ msgstr "38" + +#~ msgid "[@RongRongStudio](https://github.com/RongRongStudio)" +#~ msgstr "[@RongRongStudio](https://github.com/RongRongStudio)" + +#~ msgid "2025/4/22" +#~ 
msgstr "2025/4/22" + +#~ msgid "" +#~ "[848e041](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/848e041a54732c923660dd02daf8e9bf439736a2)" +#~ msgstr "" +#~ "[848e041](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/848e041a54732c923660dd02daf8e9bf439736a2)" + +#~ msgid "37" +#~ msgstr "37" + +#~ msgid "[@paulyu12](https://github.com/paulyu12)" +#~ msgstr "[@paulyu12](https://github.com/paulyu12)" + +#~ msgid "2025/4/17" +#~ msgstr "2025/4/17" + +#~ msgid "" +#~ "[697908f](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/697908f5cd7c65a3a917ec1a962b0886efc98c7e)" +#~ msgstr "" +#~ "[697908f](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/697908f5cd7c65a3a917ec1a962b0886efc98c7e)" + +#~ msgid "36" +#~ msgstr "36" + +#~ msgid "[@heartStrive1998](https://github.com/heartStrive1998)" +#~ msgstr "[@heartStrive1998](https://github.com/heartStrive1998)" + +#~ msgid "2025/4/16" +#~ msgstr "2025/4/16" + +#~ msgid "" +#~ "[2f15503](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2f155039dc3997640854daef469bbf0cb77dc6ed)" +#~ msgstr "" +#~ "[2f15503](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2f155039dc3997640854daef469bbf0cb77dc6ed)" + +#~ msgid "35" +#~ msgstr "35" + +#~ msgid "[@eeethenQ](https://github.com/eeethenQ)" +#~ msgstr "[@eeethenQ](https://github.com/eeethenQ)" + +#~ msgid "2025/4/15" +#~ msgstr "2025/4/15" + +#~ msgid "" +#~ "[44a8301](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/44a8301424ded94dae83e13b837f5bfc0a1bfc15)" +#~ msgstr "" +#~ "[44a8301](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/44a8301424ded94dae83e13b837f5bfc0a1bfc15)" + +#~ msgid "34" +#~ msgstr "34" + +#~ msgid "[@wxsIcey](https://github.com/wxsIcey)" +#~ msgstr "[@wxsIcey](https://github.com/wxsIcey)" + +#~ msgid "2025/4/10" +#~ msgstr "2025/4/10" + +#~ msgid "" +#~ "[d05ea17](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d05ea17427b82a506b97409a7de8359f18f565f7)" +#~ msgstr "" +#~ 
"[d05ea17](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d05ea17427b82a506b97409a7de8359f18f565f7)" + +#~ msgid "33" +#~ msgstr "33" + +#~ msgid "[@yx0716](https://github.com/yx0716)" +#~ msgstr "[@yx0716](https://github.com/yx0716)" + +#~ msgid "2025/4/8" +#~ msgstr "2025/4/8" + +#~ msgid "" +#~ "[5d62393](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/5d6239306be9b0f5ac6dbaa137048c372a92ff20)" +#~ msgstr "" +#~ "[5d62393](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/5d6239306be9b0f5ac6dbaa137048c372a92ff20)" + +#~ msgid "32" +#~ msgstr "32" + +#~ msgid "[@celestialli](https://github.com/celestialli)" +#~ msgstr "[@celestialli](https://github.com/celestialli)" + +#~ msgid "2025/4/7" +#~ msgstr "2025/4/7" + +#~ msgid "" +#~ "[2b765dc](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2b765dcc4974b1bafc26ff5da817ce7e652f0eb0)" +#~ msgstr "" +#~ "[2b765dc](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/2b765dcc4974b1bafc26ff5da817ce7e652f0eb0)" + +#~ msgid "31" +#~ msgstr "31" + +#~ msgid "[@hfadzxy](https://github.com/hfadzxy)" +#~ msgstr "[@hfadzxy](https://github.com/hfadzxy)" + +#~ msgid "2025/3/30" +#~ msgstr "2025/3/30" + +#~ msgid "" +#~ "[7beb433](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7beb4339dc8047af9ef64db1d0a8c59ddbb3709f)" +#~ msgstr "" +#~ "[7beb433](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7beb4339dc8047af9ef64db1d0a8c59ddbb3709f)" + +#~ msgid "30" +#~ msgstr "30" + +#~ msgid "[@wuhuikx](https://github.com/wuhuikx)" +#~ msgstr "[@wuhuikx](https://github.com/wuhuikx)" + +#~ msgid "2025/3/28" +#~ msgstr "2025/3/28" + +#~ msgid "" +#~ "[57a84bb](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/57a84bb7befeaa0dc62aa35fa406e4d6affbfcca)" +#~ msgstr "" +#~ "[57a84bb](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/57a84bb7befeaa0dc62aa35fa406e4d6affbfcca)" + +#~ msgid "29" +#~ msgstr "29" + +#~ msgid "[@zzzzwwjj](https://github.com/zzzzwwjj)" +#~ 
msgstr "[@zzzzwwjj](https://github.com/zzzzwwjj)" + +#~ msgid "" +#~ "[12390af](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/12390af075962456ecc8233d8dcce7064b75f390)" +#~ msgstr "" +#~ "[12390af](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/12390af075962456ecc8233d8dcce7064b75f390)" + +#~ msgid "28" +#~ msgstr "28" + +#~ msgid "" +#~ "[27e86b9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/27e86b993a6a810d818143ec9dbfc439a419fa77)" +#~ msgstr "" +#~ "[27e86b9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/27e86b993a6a810d818143ec9dbfc439a419fa77)" + +#~ msgid "27" +#~ msgstr "27" + +#~ msgid "[@ZhengZhenyu](https://github.com/ZhengZhenyu)" +#~ msgstr "[@ZhengZhenyu](https://github.com/ZhengZhenyu)" + +#~ msgid "2025/3/26" +#~ msgstr "2025/3/26" + +#~ msgid "" +#~ "[0b5a964](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/0b5a9643fd6c3240d7ede669e37209d7ff433841)" +#~ msgstr "" +#~ "[0b5a964](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/0b5a9643fd6c3240d7ede669e37209d7ff433841)" + +#~ msgid "26" +#~ msgstr "26" + +#~ msgid "[@baifanxxx](https://github.com/baifanxxx)" +#~ msgstr "[@baifanxxx](https://github.com/baifanxxx)" + +#~ msgid "" +#~ "[1225052](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/122505208ff6284f409846ca7294f4a4b9883285)" +#~ msgstr "" +#~ "[1225052](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/122505208ff6284f409846ca7294f4a4b9883285)" + +#~ msgid "25" +#~ msgstr "25" + +#~ msgid "[@rjg-lyh](https://github.com/rjg-lyh)" +#~ msgstr "[@rjg-lyh](https://github.com/rjg-lyh)" + +#~ msgid "2025/3/13" +#~ msgstr "2025/3/13" + +#~ msgid "" +#~ "[6512470](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/65124705fb39d4cc2c94c80254421e067a82fe50)" +#~ msgstr "" +#~ "[6512470](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/65124705fb39d4cc2c94c80254421e067a82fe50)" + +#~ msgid "24" +#~ msgstr "24" + +#~ msgid 
"[@xiemingda-1002](https://github.com/xiemingda-1002)" +#~ msgstr "[@xiemingda-1002](https://github.com/xiemingda-1002)" + +#~ msgid "2025/3/12" +#~ msgstr "2025/3/12" + +#~ msgid "" +#~ "[59ea23d](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/59ea23d0d394879d7f33de6fd22242539b9c3cc5)" +#~ msgstr "" +#~ "[59ea23d](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/59ea23d0d394879d7f33de6fd22242539b9c3cc5)" + +#~ msgid "23" +#~ msgstr "23" + +#~ msgid "[@yiz-liu](https://github.com/yiz-liu)" +#~ msgstr "[@yiz-liu](https://github.com/yiz-liu)" + +#~ msgid "2025/3/11" +#~ msgstr "2025/3/11" + +#~ msgid "" +#~ "[0db6670](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/0db6670bfab8cb1d84c9e7270df0a1d42d6ce7ca)" +#~ msgstr "" +#~ "[0db6670](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/0db6670bfab8cb1d84c9e7270df0a1d42d6ce7ca)" + +#~ msgid "22" +#~ msgstr "22" + +#~ msgid "[@new-TonyWang](https://github.com/new-TonyWang)" +#~ msgstr "[@new-TonyWang](https://github.com/new-TonyWang)" + +#~ msgid "" +#~ "[dfb4e23](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/dfb4e23e9d820ac992a071c123bbe983c7b01b2e)" +#~ msgstr "" +#~ "[dfb4e23](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/dfb4e23e9d820ac992a071c123bbe983c7b01b2e)" + +#~ msgid "21" +#~ msgstr "21" + +#~ msgid "[@mengwei805](https://github.com/mengwei805)" +#~ msgstr "[@mengwei805](https://github.com/mengwei805)" + +#~ msgid "2025/3/6" +#~ msgstr "2025/3/6" + +#~ msgid "" +#~ "[8fcf3d1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/8fcf3d1704084626db35c5dc82ade446508598d4)" +#~ msgstr "" +#~ "[8fcf3d1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/8fcf3d1704084626db35c5dc82ade446508598d4)" + +#~ msgid "20" +#~ msgstr "20" + +#~ msgid "[@baymax591](https://github.com/baymax591)" +#~ msgstr "[@baymax591](https://github.com/baymax591)" + +#~ msgid "2025/2/28" +#~ msgstr "2025/2/28" + +#~ msgid "" +#~ 
"[e8131b9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e8131b99cf199f50a304e6e6fb125a1b95bcc92b)" +#~ msgstr "" +#~ "[e8131b9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/e8131b99cf199f50a304e6e6fb125a1b95bcc92b)" + +#~ msgid "19" +#~ msgstr "19" + +#~ msgid "[@dependabot](https://github.com/dependabot)" +#~ msgstr "[@dependabot](https://github.com/dependabot)" + +#~ msgid "2025/2/27" +#~ msgstr "2025/2/27" + +#~ msgid "" +#~ "[a5564ed](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/a5564ed5d8fd9818936a22d9ea35951a27513b4c)" +#~ msgstr "" +#~ "[a5564ed](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/a5564ed5d8fd9818936a22d9ea35951a27513b4c)" + +#~ msgid "18" +#~ msgstr "18" + +#~ msgid "[@shink](https://github.com/shink)" +#~ msgstr "[@shink](https://github.com/shink)" + +#~ msgid "" +#~ "[6aed833](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6aed83335cbe92fd0b8ef07c28966a753d012ccb)" +#~ msgstr "" +#~ "[6aed833](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/6aed83335cbe92fd0b8ef07c28966a753d012ccb)" + +#~ msgid "17" +#~ msgstr "17" + +#~ msgid "[@wwfu109](https://github.com/wwfu109)" +#~ msgstr "[@wwfu109](https://github.com/wwfu109)" + +#~ msgid "" +#~ "[b074047](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/b07404766bdaf6e3cebc5cb0aba89a247501302e)" +#~ msgstr "" +#~ "[b074047](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/b07404766bdaf6e3cebc5cb0aba89a247501302e)" + +#~ msgid "16" +#~ msgstr "16" + +#~ msgid "[@kunpengW-code](https://github.com/kunpengW-code)" +#~ msgstr "[@kunpengW-code](https://github.com/kunpengW-code)" + +#~ msgid "2025/2/26" +#~ msgstr "2025/2/26" + +#~ msgid "" +#~ "[ca807ce](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/ca807ce49ed64aa89242f5ae29b9862a77648b45)" +#~ msgstr "" +#~ "[ca807ce](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/ca807ce49ed64aa89242f5ae29b9862a77648b45)" + +#~ msgid "15" +#~ msgstr "15" + +#~ 
msgid "[@Yaphets24](https://github.com/Yaphets24)" +#~ msgstr "[@Yaphets24](https://github.com/Yaphets24)" + +#~ msgid "2025/2/22" +#~ msgstr "2025/2/22" + +#~ msgid "" +#~ "[d0b3cb4](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d0b3cb4fa79d5fc7f8245a3c68885ce1fa030ba4)" +#~ msgstr "" +#~ "[d0b3cb4](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d0b3cb4fa79d5fc7f8245a3c68885ce1fa030ba4)" + +#~ msgid "14" +#~ msgstr "14" + +#~ msgid "[@noemotiovon](https://github.com/noemotiovon)" +#~ msgstr "[@noemotiovon](https://github.com/noemotiovon)" + +#~ msgid "2025/2/21" +#~ msgstr "2025/2/21" + +#~ msgid "" +#~ "[202b39a](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/202b39a38c2869b0ecc3df486550fb555a2eb0c0)" +#~ msgstr "" +#~ "[202b39a](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/202b39a38c2869b0ecc3df486550fb555a2eb0c0)" + +#~ msgid "13" +#~ msgstr "13" + +#~ msgid "[@SidaoY](https://github.com/SidaoY)" +#~ msgstr "[@SidaoY](https://github.com/SidaoY)" + +#~ msgid "2025/2/18" +#~ msgstr "2025/2/18" + +#~ msgid "" +#~ "[718c763](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/718c7638555d12cd43ea2a9e497e185778b68595)" +#~ msgstr "" +#~ "[718c763](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/718c7638555d12cd43ea2a9e497e185778b68595)" + +#~ msgid "12" +#~ msgstr "12" + +#~ msgid "[@ShiyaNiu](https://github.com/ShiyaNiu)" +#~ msgstr "[@ShiyaNiu](https://github.com/ShiyaNiu)" + +#~ msgid "2025/2/17" +#~ msgstr "2025/2/17" + +#~ msgid "" +#~ "[36ea38f](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/36ea38fde56437ff1745bd95cd8d9e02a6578d38)" +#~ msgstr "" +#~ "[36ea38f](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/36ea38fde56437ff1745bd95cd8d9e02a6578d38)" + +#~ msgid "11" +#~ msgstr "11" + +#~ msgid "[@ji-huazhong](https://github.com/ji-huazhong)" +#~ msgstr "[@ji-huazhong](https://github.com/ji-huazhong)" + +#~ msgid "2025/2/12" +#~ msgstr "2025/2/12" + +#~ msgid "" +#~ 
"[c8b57d1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/c8b57d10b24efcd9b4fadeb66cfbf66aa3dd5f82)" +#~ msgstr "" +#~ "[c8b57d1](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/c8b57d10b24efcd9b4fadeb66cfbf66aa3dd5f82)" + +#~ msgid "10" +#~ msgstr "10" + +#~ msgid "[@Angazenn](https://github.com/Angazenn)" +#~ msgstr "[@Angazenn](https://github.com/Angazenn)" + +#~ msgid "2025/2/11" +#~ msgstr "2025/2/11" + +#~ msgid "" +#~ "[7637759](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7637759056028839c74960d9cfd3ce6275ee5d35)" +#~ msgstr "" +#~ "[7637759](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7637759056028839c74960d9cfd3ce6275ee5d35)" + +#~ msgid "9" +#~ msgstr "9" + +#~ msgid "[@whx-sjtu](https://github.com/whx-sjtu)" +#~ msgstr "[@whx-sjtu](https://github.com/whx-sjtu)" + +#~ msgid "2025/2/7" +#~ msgstr "2025/2/7" + +#~ msgid "" +#~ "[8fc5dc9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/8fc5dc966aaf4e174d1ec0d1902c40289411ec0e)" +#~ msgstr "" +#~ "[8fc5dc9](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/8fc5dc966aaf4e174d1ec0d1902c40289411ec0e)" + +#~ msgid "8" +#~ msgstr "8" + +#~ msgid "[@zouyida2002](https://github.com/zouyida2002)" +#~ msgstr "[@zouyida2002](https://github.com/zouyida2002)" + +#~ msgid "" +#~ "[4495fc6](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/4495fc68389e3fb1ef14534c202948931e38446b)" +#~ msgstr "" +#~ "[4495fc6](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/4495fc68389e3fb1ef14534c202948931e38446b)" + +#~ msgid "7" +#~ msgstr "7" + +#~ msgid "[@hw_whx](https://github.com/hw_whx)" +#~ msgstr "[@hw_whx](https://github.com/hw_whx)" + +#~ msgid "" +#~ "[7d16772](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7d1677263bc6628ade33bb780455e0f6e5b9b27a)" +#~ msgstr "" +#~ "[7d16772](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7d1677263bc6628ade33bb780455e0f6e5b9b27a)" + +#~ msgid "6" +#~ msgstr "6" + +#~ msgid 
"[@MengqingCao](https://github.com/MengqingCao)" +#~ msgstr "[@MengqingCao](https://github.com/MengqingCao)" + +#~ msgid "2025/2/6" +#~ msgstr "2025/2/6" + +#~ msgid "" +#~ "[7d9ae22](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7d9ae22ecb6dc3ea4e720e5109cf46e1ae7da730)" +#~ msgstr "" +#~ "[7d9ae22](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/7d9ae22ecb6dc3ea4e720e5109cf46e1ae7da730)" + +#~ msgid "5" +#~ msgstr "5" + +#~ msgid "[@Potabk](https://github.com/Potabk)" +#~ msgstr "[@Potabk](https://github.com/Potabk)" + +#~ msgid "" +#~ "[8cb5615](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/8cb5615fb010b34c2f4f89e03e6257bfee851f86)" +#~ msgstr "" +#~ "[8cb5615](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/8cb5615fb010b34c2f4f89e03e6257bfee851f86)" + +#~ msgid "4" +#~ msgstr "4" + +#~ msgid "" +#~ "[a48b9ad](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/a48b9addefd292af523644411d4ff4142dd4bc66)" +#~ msgstr "" +#~ "[a48b9ad](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/a48b9addefd292af523644411d4ff4142dd4bc66)" + +#~ msgid "3" +#~ msgstr "3" + +#~ msgid "[@shen-shanshan](https://github.com/shen-shanshan)" +#~ msgstr "[@shen-shanshan](https://github.com/shen-shanshan)" + +#~ msgid "" +#~ "[bfccf73](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/bfccf739e2fe121b54d9b198c2ec205a9379190e)" +#~ msgstr "" +#~ "[bfccf73](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/bfccf739e2fe121b54d9b198c2ec205a9379190e)" + +#~ msgid "2" +#~ msgstr "2" + +#~ msgid "2025/2/5" +#~ msgstr "2025/2/5" + +#~ msgid "" +#~ "[d5e7756](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d5e7756028bd5884ade96b654555c375770a2f64)" +#~ msgstr "" +#~ "[d5e7756](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/d5e7756028bd5884ade96b654555c375770a2f64)" + +#~ msgid "1" +#~ msgstr "1" + +#~ msgid "[@simon-mo](https://github.com/simon-mo)" +#~ msgstr "[@simon-mo](https://github.com/simon-mo)" + 
+#~ msgid "2025/1/29" +#~ msgstr "2025/1/29" + +#~ msgid "" +#~ "[eb28342](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/eb283428ddc17207b6866118f9bc15454b5b8801)" +#~ msgstr "" +#~ "[eb28342](https://github.com/vllm-project/vllm-" +#~ "kunlun/commit/eb283428ddc17207b6866118f9bc15454b5b8801)" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po new file mode 100644 index 0000000..6aacea0 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po @@ -0,0 +1,228 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/community/governance.md:1 +msgid "Governance" +msgstr "治理" + +#: ../../source/community/governance.md:3 +msgid "Mission" +msgstr "使命" + +#~ msgid "" +#~ "As a vital component of vLLM, the" +#~ " vLLM Kunlun project is dedicated to" +#~ " providing an easy, fast, and cheap" +#~ " LLM Serving for Everyone on Kunlun" +#~ " XPU, and to actively contribute to" +#~ " the enrichment of vLLM." 
+#~ msgstr "" +#~ "作为 vLLM 的重要组成部分,vLLM Kunlun 项目致力于为所有人在 " +#~ "Kunlun XPU 上提供简单、快速且低成本的大语言模型服务,并积极促进 vLLM " +#~ "的丰富发展。" + +#~ msgid "Principles" +#~ msgstr "原则" + +#~ msgid "" +#~ "vLLM Kunlun follows the vLLM community's" +#~ " code of conduct:[vLLM - CODE OF " +#~ "CONDUCT](https://github.com/vllm-" +#~ "project/vllm/blob/main/CODE_OF_CONDUCT.md)" +#~ msgstr "" +#~ "vLLM Kunlun 遵循 vLLM 社区的行为准则:[vLLM - " +#~ "行为准则](https://github.com/vllm-" +#~ "project/vllm/blob/main/CODE_OF_CONDUCT.md)" + +#~ msgid "Governance - Mechanics" +#~ msgstr "治理 - 机制" + +#~ msgid "" +#~ "vLLM Kunlun is an open-source " +#~ "project under the vLLM community, where" +#~ " the authority to appoint roles is" +#~ " ultimately determined by the vLLM " +#~ "community. It adopts a hierarchical " +#~ "technical governance structure." +#~ msgstr "vLLM Kunlun 是 vLLM 社区下的一个开源项目,其角色任命权最终由 vLLM 社区决定。它采用分层的技术治理结构。" + +#~ msgid "Contributor:" +#~ msgstr "贡献者:" + +#~ msgid "" +#~ "**Responsibility:** Help new contributors on" +#~ " boarding, handle and respond to " +#~ "community questions, review RFCs, code" +#~ msgstr "**职责:** 帮助新贡献者加入,处理和回复社区问题,审查RFC和代码" + +#~ msgid "" +#~ "**Requirements:** Complete at least 1 " +#~ "contribution. Contributor is someone who " +#~ "consistently and actively participates in " +#~ "a project, included but not limited " +#~ "to issue/review/commits/community involvement." +#~ msgstr "**要求:** 完成至少1次贡献。贡献者是指持续且积极参与项目的人,包括但不限于问题、评审、提交和社区参与。" + +#~ msgid "" +#~ "Contributors will be empowered [vllm-" +#~ "project/vllm-kunlun](https://github.com/vllm-project" +#~ "/vllm-kunlun) Github repo `Triage` " +#~ "permissions (`Can read and clone this" +#~ " repository. Can also manage issues " +#~ "and pull requests`) to help community" +#~ " developers collaborate more efficiently." 
+#~ msgstr "" +#~ "贡献者将被赋予 [vllm-project/vllm-" +#~ "kunlun](https://github.com/vllm-project/vllm-kunlun) " +#~ "Github 仓库的 `Triage` " +#~ "权限(`可读取和克隆此仓库。还可以管理问题和拉取请求`),以帮助社区开发者更加高效地协作。" + +#~ msgid "Maintainer:" +#~ msgstr "维护者:" + +#~ msgid "" +#~ "**Responsibility:** Develop the project's " +#~ "vision and mission. Maintainers are " +#~ "responsible for driving the technical " +#~ "direction of the entire project and " +#~ "ensuring its overall success, possessing " +#~ "code merge permissions. They formulate " +#~ "the roadmap, review contributions from " +#~ "community members, continuously contribute " +#~ "code, and actively engage in community" +#~ " activities (such as regular " +#~ "meetings/events)." +#~ msgstr "" +#~ "**责任:** " +#~ "制定项目的愿景和使命。维护者负责引领整个项目的技术方向并确保其整体成功,拥有代码合并权限。他们制定路线图,审核社区成员的贡献,持续贡献代码,并积极参与社区活动(如定期会议/活动)。" + +#~ msgid "" +#~ "**Requirements:** Deep understanding of ‌vLLM‌" +#~ " and ‌vLLM Kunlun‌ codebases, with a" +#~ " commitment to sustained code " +#~ "contributions. Competency in ‌design/development/PR" +#~ " review workflows‌." +#~ msgstr "" +#~ "**要求:** 深入理解 ‌vLLM‌ 和 ‌vLLM Kunlun‌ " +#~ "代码库,并承诺持续贡献代码。具备 ‌设计/开发/PR 审核流程‌ 的能力。" + +#~ msgid "" +#~ "**Review Quality‌:** Actively participate in" +#~ " community code reviews, ensuring high-" +#~ "quality code integration." +#~ msgstr "**评审质量:** 积极参与社区代码评审,确保高质量的代码集成。" + +#~ msgid "" +#~ "**Quality Contribution‌:** Successfully develop " +#~ "and deliver at least one major " +#~ "feature while maintaining consistent high-" +#~ "quality contributions." +#~ msgstr "**质量贡献‌:** 成功开发并交付至少一个主要功能,同时持续保持高质量的贡献。" + +#~ msgid "" +#~ "**Community Involvement‌:** Actively address " +#~ "issues, respond to forum inquiries, " +#~ "participate in discussions, and engage " +#~ "in community-driven tasks." +#~ msgstr "**社区参与:** 积极解决问题,回复论坛询问,参与讨论,并参与社区驱动的任务。" + +#~ msgid "" +#~ "Requires approval from existing Maintainers." +#~ " The vLLM community has the final " +#~ "decision-making authority." 
+#~ msgstr "需要现有维护者的批准。vLLM社区拥有最终决策权。" + +#~ msgid "" +#~ "Maintainer will be empowered [vllm-" +#~ "project/vllm-kunlun](https://github.com/vllm-project" +#~ "/vllm-kunlun) Github repo write permissions" +#~ " (`Can read, clone, and push to " +#~ "this repository. Can also manage issues" +#~ " and pull requests`)." +#~ msgstr "" +#~ "维护者将被授予 [vllm-project/vllm-" +#~ "kunlun](https://github.com/vllm-project/vllm-kunlun) " +#~ "Github 仓库的写入权限(`可以读取、克隆和推送到此仓库。还可以管理问题和拉取请求`)。" + +#~ msgid "Nominating and Removing Maintainers" +#~ msgstr "提名和移除维护者" + +#~ msgid "The Principles" +#~ msgstr "原则" + +#~ msgid "" +#~ "Membership in vLLM Kunlun is given " +#~ "to individuals on merit basis after " +#~ "they demonstrated strong expertise of " +#~ "the vLLM / vLLM Kunlun through " +#~ "contributions, reviews and discussions." +#~ msgstr "" +#~ "vLLM Kunlun 的成员资格是基于个人能力授予的,只有在通过贡献、评审和讨论展示出对 vLLM" +#~ " / vLLM Kunlun 的深厚专业知识后,才可获得。" + +#~ msgid "" +#~ "For membership in the maintainer group" +#~ " the individual has to demonstrate " +#~ "strong and continued alignment with the" +#~ " overall vLLM / vLLM Kunlun " +#~ "principles." +#~ msgstr "要成为维护者组成员,个人必须表现出与 vLLM / vLLM Kunlun 总体原则的高度一致并持续支持。" + +#~ msgid "" +#~ "Light criteria of moving module " +#~ "maintenance to ‘emeritus’ status if they" +#~ " don’t actively participate over long " +#~ "periods of time." +#~ msgstr "如果模块维护人员在长时间内没有积极参与,可根据较宽松的标准将其维护状态转为“荣誉”状态。" + +#~ msgid "The membership is for an individual, not a company." +#~ msgstr "该会员资格属于个人,而非公司。" + +#~ msgid "Nomination and Removal" +#~ msgstr "提名与罢免" + +#~ msgid "" +#~ "Nomination: Anyone can nominate someone " +#~ "to become a maintainer (include self-" +#~ "nominate). All existing maintainers are " +#~ "responsible for evaluating the nomination. 
" +#~ "The nominator should provide nominee's " +#~ "info around the strength of the " +#~ "candidate to be a maintainer, include" +#~ " but not limited to review quality," +#~ " quality contribution, community involvement." +#~ msgstr "提名:任何人都可以提名他人成为维护者(包括自荐)。所有现有维护者都有责任评估提名。提名人应提供被提名人成为维护者的相关优势信息,包括但不限于评审质量、优质贡献、社区参与等。" + +#~ msgid "" +#~ "Removal: Anyone can nominate a person" +#~ " to be removed from maintainer " +#~ "position (include self-nominate). All " +#~ "existing maintainers are responsible for " +#~ "evaluating the nomination. The nominator " +#~ "should provide nominee's info, include " +#~ "but not limited to lack of " +#~ "activity, conflict with the overall " +#~ "direction and other information that " +#~ "makes them unfit to be a " +#~ "maintainer." +#~ msgstr "移除:任何人都可以提名某人被移出维护者职位(包括自荐)。所有现有维护者都有责任评估该提名。提名者应提供被提名人的相关信息,包括但不限于缺乏活动、与整体方向冲突以及使其不适合作为维护者的其他信息。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po new file mode 100644 index 0000000..ac99862 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po @@ -0,0 +1,120 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/community/user_stories/index.md:1 +#, fuzzy +msgid "User stories" +msgstr "用户故事" + +#~ msgid "More details" +#~ msgstr "更多细节" + +#~ msgid "" +#~ "Read case studies on how users and" +#~ " developers solves real, everyday problems" +#~ " with vLLM Kunlun" +#~ msgstr "阅读案例研究,了解用户和开发者如何使用 vLLM Kunlun 解决实际日常问题。" + +#~ msgid "" +#~ "[LLaMA-Factory](./llamafactory.md) is an " +#~ "easy-to-use and efficient platform " +#~ "for training and fine-tuning large " +#~ "language models, it supports vLLM Kunlun" +#~ " to speed up inference since " +#~ "[LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-" +#~ "Factory/pull/7739), gain 2x performance " +#~ "enhancement of inference." +#~ msgstr "" +#~ "[LLaMA-Factory](./llamafactory.md) " +#~ "是一个易于使用且高效的大语言模型训练与微调平台,自 [LLaMA-" +#~ "Factory#7739](https://github.com/hiyouga/LLaMA-" +#~ "Factory/pull/7739) 起支持 vLLM Kunlun 加速推理,推理性能提升" +#~ " 2 倍。" + +#~ msgid "" +#~ "[Huggingface/trl](https://github.com/huggingface/trl) is a" +#~ " cutting-edge library designed for " +#~ "post-training foundation models using " +#~ "advanced techniques like SFT, PPO and" +#~ " DPO, it uses vLLM Kunlun since " +#~ "[v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) " +#~ "to support RLHF on Kunlun XPU." 
+#~ msgstr "" +#~ "[Huggingface/trl](https://github.com/huggingface/trl) " +#~ "是一个前沿的库,专为使用 SFT、PPO 和 DPO " +#~ "等先进技术对基础模型进行后训练而设计。从 " +#~ "[v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) " +#~ "版本开始,该库利用 vLLM Kunlun 来支持在 Kunlun XPU" +#~ " 上进行 RLHF。" + +#~ msgid "" +#~ "[MindIE Turbo](https://pypi.org/project/mindie-turbo) " +#~ "is an LLM inference engine acceleration" +#~ " plug-in library developed by Baidu" +#~ " on Kunlun hardware, which includes " +#~ "self-developed large language model " +#~ "optimization algorithms and optimizations " +#~ "related to the inference engine " +#~ "framework. It supports vLLM Kunlun since" +#~ " " +#~ "[2.0rc1](https://www.hikunlun.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev" +#~ "/mindie-turbo-0001.html)." +#~ msgstr "" +#~ "[MindIE Turbo](https://pypi.org/project/mindie-turbo) " +#~ "是华为在昇腾硬件上开发的一款用于加速LLM推理引擎的插件库,包含自主研发的大语言模型优化算法及与推理引擎框架相关的优化。从 " +#~ "[2.0rc1](https://www.hikunlun.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev" +#~ "/mindie-turbo-0001.html) 起,支持 vLLM Kunlun。" + +#~ msgid "" +#~ "[GPUStack](https://github.com/gpustack/gpustack) is an " +#~ "open-source GPU cluster manager for " +#~ "running AI models. It supports vLLM " +#~ "Kunlun since " +#~ "[v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2)," +#~ " see more GPUStack performance evaluation" +#~ " info on " +#~ "[link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew)." 
+#~ msgstr "" +#~ "[GPUStack](https://github.com/gpustack/gpustack) 是一个开源的 " +#~ "GPU 集群管理器,用于运行 AI 模型。从 " +#~ "[v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2) " +#~ "版本开始支持 vLLM Kunlun,更多 GPUStack 性能评测信息见 " +#~ "[链接](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew)。" + +#~ msgid "" +#~ "[verl](https://github.com/volcengine/verl) is a " +#~ "flexible, efficient and production-ready " +#~ "RL training library for large language" +#~ " models (LLMs), uses vLLM Kunlun " +#~ "since " +#~ "[v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), " +#~ "see more info on [verl x Kunlun" +#~ " " +#~ "Quickstart](https://verl.readthedocs.io/en/latest/kunlun_tutorial/kunlun_quick_start.html)." +#~ msgstr "" +#~ "[verl](https://github.com/volcengine/verl) " +#~ "是一个灵活、高效且可用于生产环境的大型语言模型(LLM)强化学习训练库,自 " +#~ "[v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0) " +#~ "起支持 vLLM Kunlun,更多信息请参见 [verl x Kunlun" +#~ " " +#~ "快速上手](https://verl.readthedocs.io/en/latest/kunlun_tutorial/kunlun_quick_start.html)。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po new file mode 100644 index 0000000..5b9a6fd --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po @@ -0,0 +1,108 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/community/user_stories/llamafactory.md:1 +msgid "LLaMA-Factory" +msgstr "LLaMA-Factory" + +#: ../../source/community/user_stories/llamafactory.md:3 +#, fuzzy +msgid "**Introduction**" +msgstr "**关于 / 介绍**" + +#: ../../source/community/user_stories/llamafactory.md:5 +msgid "" +"[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-" +"use and efficient platform for training and fine-tuning large language " +"models. With LLaMA-Factory, you can fine-tune hundreds of pre-trained " +"models locally without writing any code." +msgstr "" +"[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) " +"是一个易于使用且高效的平台,用于训练和微调大型语言模型。有了 LLaMA-" +"Factory,你可以在本地对数百个预训练模型进行微调,无需编写任何代码。" + +#: ../../source/community/user_stories/llamafactory.md:7 +#, fuzzy +msgid "" +"LLaMA-Facotory users need to evaluate and inference the model after fine-" +"tuning." +msgstr "LLaMA-Factory 用户需要在对模型进行微调后对模型进行评估和推理。" + +#: ../../source/community/user_stories/llamafactory.md:9 +#, fuzzy +msgid "**Business challenge**" +msgstr "**业务挑战**" + +#: ../../source/community/user_stories/llamafactory.md:11 +#, fuzzy +msgid "" +"LLaMA-Factory uses Transformers to perform inference on Kunlun XPUs, but " +"the speed is slow."
+msgstr "LLaMA-Factory 使用 transformers 在 Kunlun XPU 上进行推理,但速度较慢。" + +#: ../../source/community/user_stories/llamafactory.md:13 +#, fuzzy +msgid "**Benefits with vLLM Kunlun**" +msgstr "**通过 vLLM Kunlun 解决挑战与收益**" + +#: ../../source/community/user_stories/llamafactory.md:15 +msgid "" +"With the joint efforts of LLaMA-Factory and vLLM Kunlun ([LLaMA-" +"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), " +"LLaMA-Factory has achieved significant performance gains during model " +"inference. Benchmark results show that its inference speed is now up to " +"2× faster compared to the Transformers implementation." +msgstr "在 LLaMA-Factory 和 vLLM Kunlun 的共同努力下(参见 [LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)),LLaMA-Factory 在模型推理阶段取得了显著的性能提升。基准测试结果显示,其推理速度相比 Transformers 实现最高可达 2 倍。" + +#: ../../source/community/user_stories/llamafactory.md:17 +msgid "**Learn more**" +msgstr "**了解更多**" + +#: ../../source/community/user_stories/llamafactory.md:19 +#, fuzzy +msgid "" +"See more details about LLaMA-Factory and how it uses vLLM Kunlun for " +"inference on Kunlun XPUs in [LLaMA-Factory Kunlun XPU " +"Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html)." +msgstr "" +"在以下文档中查看更多关于 LLaMA-Factory 以及其如何在 Kunlun XPU 上使用 vLLM Kunlun 进行推理的信息" +":[LLaMA-Factory Kunlun XPU " +"推理](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html)。" + +#~ msgid "" +#~ "With the joint efforts of LLaMA-" +#~ "Factory and vLLM Kunlun ([LLaMA-" +#~ "Factory#7739](https://github.com/hiyouga/LLaMA-" +#~ "Factory/pull/7739)), the performance of " +#~ "LLaMA-Factory in the model inference " +#~ "stage has been significantly improved. " +#~ "According to the test results, the " +#~ "inference speed of LLaMA-Factory has " +#~ "been increased to 2x compared to " +#~ "the transformers version."
+#~ msgstr "" +#~ "在 LLaMA-Factory 和 vLLM Kunlun " +#~ "的共同努力下(参见 [LLaMA-Factory#7739](https://github.com/hiyouga" +#~ "/LLaMA-Factory/pull/7739)),LLaMA-Factory " +#~ "在模型推理阶段的性能得到了显著提升。根据测试结果,LLaMA-Factory 的推理速度相比 " +#~ "transformers 版本提升到了 2 倍。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po new file mode 100644 index 0000000..3be7486 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po @@ -0,0 +1,575 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/community/versioning_policy.md:1 +msgid "Versioning policy" +msgstr "版本管理策略" + +#~ msgid "" +#~ "Starting with vLLM 0.7.x, the vLLM " +#~ "Kunlun Plugin ([vllm-project/vllm-" +#~ "kunlun](https://github.com/vllm-project/vllm-kunlun)) " +#~ "project follows the [PEP " +#~ "440](https://peps.python.org/pep-0440/) to publish " +#~ "matching with vLLM ([vllm-" +#~ "project/vllm](https://github.com/vllm-project/vllm))." 
+#~ msgstr "" +#~ "从 vLLM 0.7.x 开始,vLLM Kunlun 插件([vllm-" +#~ "project/vllm-kunlun](https://github.com/vllm-project" +#~ "/vllm-kunlun))项目遵循 [PEP " +#~ "440](https://peps.python.org/pep-0440/) ,以与 vLLM([vllm-" +#~ "project/vllm](https://github.com/vllm-project/vllm))版本匹配发布。" + +#~ msgid "vLLM Kunlun Plugin versions" +#~ msgstr "vLLM Kunlun 插件版本" + +#~ msgid "" +#~ "Each vLLM Kunlun release will be " +#~ "versioned: `v[major].[minor].[micro][rcN][.postN]` (such" +#~ " as `v0.7.3rc1`, `v0.7.3`, `v0.7.3.post1`)" +#~ msgstr "" +#~ "每个 vLLM Kunlun " +#~ "版本将采用以下版本格式:`v[major].[minor].[micro][rcN][.postN]`(例如 " +#~ "`v0.7.3rc1`、`v0.7.3`、`v0.7.3.post1`)" + +#~ msgid "" +#~ "**Final releases**: will typically be " +#~ "released every **3 months**, will take" +#~ " the vLLM upstream release plan and" +#~ " Kunlun software product release plan " +#~ "into comprehensive consideration." +#~ msgstr "**正式版本**:通常每**3个月**发布一次,将综合考虑 vLLM 上游发行计划和 Kunlun 软件产品发行计划。" + +#~ msgid "" +#~ "**Pre releases**: will typically be " +#~ "released **on demand**, ending with rcN," +#~ " represents the Nth release candidate " +#~ "version, to support early testing by " +#~ "our users prior to a final " +#~ "release." +#~ msgstr "**预发布版本**:通常会**按需发布**,以 rcN 结尾,表示第N个候选发布版本,旨在支持用户在正式发布前进行早期测试。" + +#~ msgid "" +#~ "**Post releases**: will typically be " +#~ "released **on demand** to support to " +#~ "address minor errors in a final " +#~ "release. It's different from [PEP-440 " +#~ "post release note](https://peps.python.org/pep-0440" +#~ "/#post-releases) suggestion, it will " +#~ "contain actual bug fixes considering " +#~ "that the final release version should" +#~ " be matched strictly with the vLLM" +#~ " final release version " +#~ "(`v[major].[minor].[micro]`). The post version " +#~ "has to be published as a patch " +#~ "version of the final release."
+#~ msgstr "" +#~ "**后续版本**:通常会根据需要发布,以支持解决正式发布中的小错误。这与 [PEP-440 " +#~ "的后续版本说明](https://peps.python.org/pep-0440/#post-releases) " +#~ "建议不同,它将包含实际的 bug 修复,因为最终发布版本应严格与 vLLM " +#~ "的最终发布版本(`v[major].[minor].[micro]`)匹配。后续版本必须以正式发布的补丁版本形式发布。" + +#~ msgid "For example:" +#~ msgstr "例如:" + +#~ msgid "" +#~ "`v0.7.x`: it's the first final release" +#~ " to match the vLLM `v0.7.x` version." +#~ msgstr "`v0.7.x`:这是第一个与 vLLM `v0.7.x` 版本相匹配的正式发布版本。" + +#~ msgid "`v0.7.3rc1`: will be the first pre version of vLLM Kunlun." +#~ msgstr "`v0.7.3rc1`:将会是 vLLM Kunlun 的第一个预发布版本。" + +#~ msgid "" +#~ "`v0.7.3.post1`: will be the post release" +#~ " if the `v0.7.3` release has some " +#~ "minor errors." +#~ msgstr "`v0.7.3.post1`:如果 `v0.7.3` 版本发布有一些小错误,将作为后续修正版发布。" + +#~ msgid "Release Compatibility Matrix" +#~ msgstr "版本兼容性矩阵" + +#~ msgid "Following is the Release Compatibility Matrix for vLLM Kunlun Plugin:" +#~ msgstr "以下是 vLLM Kunlun 插件的版本兼容性矩阵:" + +#~ msgid "vLLM Kunlun" +#~ msgstr "vLLM Kunlun" + +#~ msgid "vLLM" +#~ msgstr "vLLM" + +#~ msgid "Python" +#~ msgstr "Python" + +#~ msgid "Stable CANN" +#~ msgstr "Stable CANN" + +#~ msgid "PyTorch/torch_npu" +#~ msgstr "PyTorch/torch_npu" + +#~ msgid "MindIE Turbo" +#~ msgstr "MindIE Turbo" + +#~ msgid "v0.9.2rc1" +#~ msgstr "v0.9.2rc1" + +#~ msgid "v0.9.2" +#~ msgstr "v0.9.2" + +#~ msgid ">= 3.9, < 3.12" +#~ msgstr ">= 3.9,< 3.12" + +#~ msgid "8.1.RC1" +#~ msgstr "8.1.RC1" + +#~ msgid "2.5.1 / 2.5.1.post1.dev20250619" +#~ msgstr "2.5.1 / 2.5.1.post1.dev20250619" + +#~ msgid "v0.9.1rc1" +#~ msgstr "v0.9.1rc1" + +#~ msgid "v0.9.1" +#~ msgstr "v0.9.1" + +#~ msgid "2.5.1 / 2.5.1.post1.dev20250528" +#~ msgstr "2.5.1 / 2.5.1.post1.dev20250528" + +#~ msgid "v0.9.0rc2" +#~ msgstr "v0.9.0rc2" + +#~ msgid "v0.9.0" +#~ msgstr "v0.9.0" + +#~ msgid "2.5.1 / 2.5.1" +#~ msgstr "2.5.1 / 2.5.1" + +#~ msgid "v0.9.0rc1" +#~ msgstr "v0.9.0rc1" + +#~ msgid "v0.8.5rc1" +#~ msgstr "v0.8.5rc1" + +#~ msgid "v0.8.5.post1" +#~ msgstr "v0.8.5.post1" + +#~ 
msgid "v0.8.4rc2" +#~ msgstr "v0.8.4rc2" + +#~ msgid "v0.8.4" +#~ msgstr "v0.8.4" + +#~ msgid "8.0.0" +#~ msgstr "8.0.0" + +#~ msgid "v0.7.3.post1" +#~ msgstr "v0.7.3.post1" + +#~ msgid "v0.7.3" +#~ msgstr "v0.7.3" + +#~ msgid "2.0rc1" +#~ msgstr "2.0候选版本1" + +#~ msgid "Release cadence" +#~ msgstr "发布节奏" + +#~ msgid "release window" +#~ msgstr "发布窗口" + +#~ msgid "Date" +#~ msgstr "日期" + +#~ msgid "Event" +#~ msgstr "事件" + +#~ msgid "2025.07.11" +#~ msgstr "2025.07.11" + +#~ msgid "Release candidates, v0.9.2rc1" +#~ msgstr "候选发布版本,v0.9.2rc1" + +#~ msgid "2025.06.22" +#~ msgstr "2025.06.22" + +#~ msgid "Release candidates, v0.9.1rc1" +#~ msgstr "候选发布版本,v0.9.1rc1" + +#~ msgid "2025.06.10" +#~ msgstr "2025.06.10" + +#~ msgid "Release candidates, v0.9.0rc2" +#~ msgstr "候选发布版本,v0.9.0rc2" + +#~ msgid "2025.06.09" +#~ msgstr "2025.06.09" + +#~ msgid "Release candidates, v0.9.0rc1" +#~ msgstr "候选发布版本,v0.9.0rc1" + +#~ msgid "2025.05.29" +#~ msgstr "2025.05.29" + +#~ msgid "v0.7.x post release, v0.7.3.post1" +#~ msgstr "v0.7.x 补丁版,v0.7.3.post1" + +#~ msgid "2025.05.08" +#~ msgstr "2025.05.08" + +#~ msgid "v0.7.x Final release, v0.7.3" +#~ msgstr "v0.7.x 正式版,v0.7.3" + +#~ msgid "2025.05.06" +#~ msgstr "2025.05.06" + +#~ msgid "Release candidates, v0.8.5rc1" +#~ msgstr "候选发布版本,v0.8.5rc1" + +#~ msgid "2025.04.28" +#~ msgstr "2025.04.28" + +#~ msgid "Release candidates, v0.8.4rc2" +#~ msgstr "候选发布版本,v0.8.4rc2" + +#~ msgid "2025.04.18" +#~ msgstr "2025.04.18" + +#~ msgid "Release candidates, v0.8.4rc1" +#~ msgstr "候选发布版本,v0.8.4rc1" + +#~ msgid "2025.03.28" +#~ msgstr "2025.03.28" + +#~ msgid "Release candidates, v0.7.3rc2" +#~ msgstr "候选发布版本,v0.7.3rc2" + +#~ msgid "2025.03.14" +#~ msgstr "2025.03.14" + +#~ msgid "Release candidates, v0.7.3rc1" +#~ msgstr "候选发布版本,v0.7.3rc1" + +#~ msgid "2025.02.19" +#~ msgstr "2025.02.19" + +#~ msgid "Release candidates, v0.7.1rc1" +#~ msgstr "候选发布版本,v0.7.1rc1" + +#~ msgid "Branch policy" +#~ msgstr "分支策略" + +#~ msgid "vLLM Kunlun has main branch
and dev branch." +#~ msgstr "vLLM Kunlun 有主分支和开发分支。" + +#~ msgid "" +#~ "**main**: main branch,corresponds to the " +#~ "vLLM main branch and latest 1 or" +#~ " 2 release version. It is " +#~ "continuously monitored for quality through " +#~ "Kunlun CI." +#~ msgstr "**main**:main 分支,对应 vLLM 的主分支和最新的 1 或 2 个发布版本。该分支通过 Kunlun CI 持续监控质量。" + +#~ msgid "" +#~ "**vX.Y.Z-dev**: development branch, created " +#~ "with part of new releases of vLLM." +#~ " For example, `v0.7.3-dev` is the dev" +#~ " branch for vLLM `v0.7.3` version." +#~ msgstr "" +#~ "**vX.Y.Z-dev**:开发分支,是随着 vLLM 新版本的一部分一起创建的。例如,`v0.7.3-dev`" +#~ " 是 vLLM `v0.7.3` 版本的开发分支。" + +#~ msgid "" +#~ "Usually, a commit should be ONLY " +#~ "first merged in the main branch, " +#~ "and then backported to the dev " +#~ "branch to reduce maintenance costs as" +#~ " much as possible." +#~ msgstr "通常,提交应该只先合并到主分支,然后再回溯合并到开发分支,以尽可能降低维护成本。" + +#~ msgid "Maintenance branch and EOL:" +#~ msgstr "维护分支与生命周期结束(EOL):" + +#~ msgid "The branch status will be in one of the following states:" +#~ msgstr "分支状态将处于以下几种状态之一:" + +#~ msgid "Branch" +#~ msgstr "分支" + +#~ msgid "Time frame" +#~ msgstr "时间范围" + +#~ msgid "Summary" +#~ msgstr "摘要" + +#~ msgid "Maintained" +#~ msgstr "维护中" + +#~ msgid "Approximately 2-3 minor versions" +#~ msgstr "大约 2-3 个小版本" + +#~ msgid "All bugfixes are appropriate. Releases produced, CI commitment." +#~ msgstr "所有的错误修复都是合适的。正常发布版本,持续集成承诺。" + +#~ msgid "Unmaintained" +#~ msgstr "无人维护" + +#~ msgid "Community interest driven" +#~ msgstr "社区兴趣驱动" + +#~ msgid "All bugfixes are appropriate. 
No Releases produced, No CI commitment" +#~ msgstr "所有的 bug 修复都是合适的。没有发布版本,不承诺持续集成(CI)。" + +#~ msgid "End of Life (EOL)" +#~ msgstr "生命周期结束(EOL)" + +#~ msgid "N/A" +#~ msgstr "不适用" + +#~ msgid "Branch no longer accepting changes" +#~ msgstr "该分支不再接受更改" + +#~ msgid "Branch state" +#~ msgstr "分支状态" + +#~ msgid "" +#~ "Note that vLLM Kunlun will only be" +#~ " released for a certain vLLM release" +#~ " version rather than all versions. " +#~ "Hence, You might see only part of" +#~ " versions have dev branches (such as" +#~ " only `0.7.1-dev` / `0.7.3-dev` but " +#~ "no `0.7.2-dev`), this is as expected." +#~ msgstr "" +#~ "请注意,vLLM Kunlun 只会针对某些 vLLM " +#~ "发布版本发布,而不是所有版本。因此,您可能会看到只有部分版本拥有开发分支(例如只有 `0.7.1-dev` /" +#~ " `0.7.3-dev`,而没有 `0.7.2-dev`),这是正常现象。" + +#~ msgid "" +#~ "Usually, each minor version of vLLM " +#~ "(such as 0.7) will correspond to a" +#~ " vLLM Kunlun version branch and " +#~ "support its latest version (for example," +#~ " we plan to support version 0.7.3)" +#~ " as following shown:" +#~ msgstr "" +#~ "通常,vLLM 的每一个小版本(例如 0.7)都会对应一个 vLLM Kunlun " +#~ "版本分支,并支持其最新版本(例如,我们计划支持 0.7.3 版),如下所示:" + +#~ msgid "Status" +#~ msgstr "状态" + +#~ msgid "Note" +#~ msgstr "注释" + +#~ msgid "main" +#~ msgstr "main" + +#~ msgid "CI commitment for vLLM main branch and vLLM 0.9.2 branch" +#~ msgstr "vLLM 主分支和 vLLM 0.9.2 分支的 CI 承诺" + +#~ msgid "v0.9.1-dev" +#~ msgstr "v0.9.1-dev" + +#~ msgid "CI commitment for vLLM 0.9.1 version" +#~ msgstr "vLLM 0.9.1 版本的 CI 承诺" + +#~ msgid "v0.7.3-dev" +#~ msgstr "v0.7.3-dev" + +#~ msgid "CI commitment for vLLM 0.7.3 version" +#~ msgstr "vLLM 0.7.3 版本的 CI 承诺" + +#~ msgid "v0.7.1-dev" +#~ msgstr "v0.7.1-dev" + +#~ msgid "Replaced by v0.7.3-dev" +#~ msgstr "已被 v0.7.3-dev 替代" + +#~ msgid "Backward compatibility" +#~ msgstr "向后兼容性" + +#~ msgid "" +#~ "For main branch, vLLM Kunlun should " +#~ "works with vLLM main branch and " +#~ "latest 1 or 2 release version. 
So" +#~ " to ensure the backward compatibility, " +#~ "we will do the following:" +#~ msgstr "" +#~ "对于主分支,vLLM Kunlun 应该与 vLLM 主分支以及最新的 1" +#~ " 或 2 个发布版本兼容。因此,为了确保向后兼容性,我们将执行以下操作:" + +#~ msgid "" +#~ "Both main branch and target vLLM " +#~ "release is tested by Kunlun E2E " +#~ "CI. For example, currently, vLLM main" +#~ " branch and vLLM 0.8.4 are tested " +#~ "now." +#~ msgstr "主分支和目标 vLLM 发行版都经过了 Kunlun E2E CI 的测试。例如,目前正在测试 vLLM 主分支和 vLLM 0.8.4。" + +#~ msgid "" +#~ "For code changes, we will make " +#~ "sure that the changes are compatible " +#~ "with the latest 1 or 2 vLLM " +#~ "release version as well. In this " +#~ "case, vLLM Kunlun introduced a version" +#~ " check machinism inner the code. " +#~ "It'll check the version of installed " +#~ "vLLM package first to decide which " +#~ "code logic to use. If users hit" +#~ " the `InvalidVersion` error, it sometimes" +#~ " means that they have installed an" +#~ " dev/editable version of vLLM package. " +#~ "In this case, we provide the env" +#~ " variable `VLLM_VERSION` to let users " +#~ "specify the version of vLLM package " +#~ "to use." +#~ msgstr "" +#~ "对于代码更改,我们也会确保这些更改与最新的 1 或 2 个 vLLM " +#~ "发行版本兼容。在这种情况下,vLLM Kunlun 在代码中引入了版本检查机制。它会先检查已安装的 " +#~ "vLLM 包的版本,然后决定使用哪段代码逻辑。如果用户遇到 `InvalidVersion` " +#~ "错误,这有时意味着他们安装了 dev/可编辑版本的 vLLM 包。此时,我们提供了环境变量 " +#~ "`VLLM_VERSION`,让用户可以指定要使用的 vLLM 包版本。" + +#~ msgid "" +#~ "For documentation changes, we will make" +#~ " sure that the changes are compatible" +#~ " with the latest 1 or 2 vLLM" +#~ " release version as well. Note should" +#~ " be added if there are any " +#~ "breaking changes." 
+#~ msgstr "对于文档更改,我们会确保这些更改也兼容于最新的1个或2个 vLLM 发布版本。如果有任何重大变更,应添加说明。" + +#~ msgid "Document Branch Policy" +#~ msgstr "文档分支政策" + +#~ msgid "" +#~ "To reduce maintenance costs, **all " +#~ "branch documentation content should remain " +#~ "consistent, and version differences can " +#~ "be controlled via variables in " +#~ "[docs/source/conf.py](https://github.com/vllm-project/vllm-" +#~ "kunlun/blob/main/docs/source/conf.py)**. While this " +#~ "is not a simple task, it is " +#~ "a principle we should strive to " +#~ "follow." +#~ msgstr "" +#~ "为了减少维护成本,**所有分支的文档内容应保持一致,版本差异可以通过 " +#~ "[docs/source/conf.py](https://github.com/vllm-project/vllm-" +#~ "kunlun/blob/main/docs/source/conf.py) " +#~ "中的变量进行控制**。虽然这并非易事,但这是我们应当努力遵循的原则。" + +#~ msgid "Version" +#~ msgstr "版本" + +#~ msgid "Purpose" +#~ msgstr "用途" + +#~ msgid "Code Branch" +#~ msgstr "代码分支" + +#~ msgid "latest" +#~ msgstr "最新" + +#~ msgid "Doc for the latest dev branch" +#~ msgstr "最新开发分支的文档" + +#~ msgid "vX.Y.Z-dev (Will be `main` after the first final release)" +#~ msgstr "vX.Y.Z-dev(在第一个正式版本发布后将成为 `main`)" + +#~ msgid "version" +#~ msgstr "版本" + +#~ msgid "Doc for historical released versions" +#~ msgstr "历史版本文档" + +#~ msgid "Git tags, like vX.Y.Z[rcN]" +#~ msgstr "Git 标签,如 vX.Y.Z[rcN]" + +#~ msgid "stable(not yet released)" +#~ msgstr "稳定版(尚未发布)" + +#~ msgid "Doc for latest final release branch" +#~ msgstr "最新正式发布分支的文档" + +#~ msgid "Will be `vX.Y.Z-dev` after the first official release" +#~ msgstr "首个正式发布后将会是 `vX.Y.Z-dev`" + +#~ msgid "As shown above:" +#~ msgstr "如上所示:" + +#~ msgid "" +#~ "`latest` documentation: Matches the current" +#~ " maintenance branch `vX.Y.Z-dev` (Will be" +#~ " `main` after the first final " +#~ "release). Continuously updated to ensure " +#~ "usability for the latest release." 
+#~ msgstr "`latest` 文档:匹配当前维护分支 `vX.Y.Z-dev`(在首次正式发布后将为 `main`)。持续更新,以确保适用于最新发布版本。" + +#~ msgid "" +#~ "`version` documentation: Corresponds to " +#~ "specific released versions (e.g., `v0.7.3`," +#~ " `v0.7.3rc1`). No further updates after " +#~ "release." +#~ msgstr "`version` 文档:对应特定的已发布版本(例如,`v0.7.3`、`v0.7.3rc1`)。发布后不再进行更新。" + +#~ msgid "" +#~ "`stable` documentation (**not yet released**):" +#~ " Official release documentation. Updates " +#~ "are allowed in real-time after " +#~ "release, typically based on vX.Y.Z-dev. " +#~ "Once stable documentation is available, " +#~ "non-stable versions should display a " +#~ "header warning: `You are viewing the " +#~ "latest developer preview docs. Click " +#~ "here to view docs for the latest" +#~ " stable release.`." +#~ msgstr "" +#~ "`stable` 文档(**尚未发布**):官方发布版文档。发布后允许实时更新,通常基于 " +#~ "vX.Y.Z-dev。一旦稳定版文档可用,非稳定版本应显示一个顶部警告:`您正在查看最新的开发预览文档。点击此处查看最新稳定版本文档。`" + +#~ msgid "Software Dependency Management" +#~ msgstr "软件依赖管理" + +#~ msgid "" +#~ "`torch-xpu`: Kunlun Extension for " +#~ "PyTorch (torch-xpu) releases a stable" +#~ " version to [PyPi](https://pypi.org/project/torch-" +#~ "xpu) every 3 months, a development " +#~ "version (aka the POC version) every " +#~ "month, and a nightly version every " +#~ "day. The PyPi stable version **CAN** " +#~ "be used in vLLM Kunlun final " +#~ "version, the monthly dev version **ONLY" +#~ " CANN** be used in vLLM Kunlun " +#~ "RC version for rapid iteration, the " +#~ "nightly version **CANNOT** be used in" +#~ " vLLM Kunlun any version and " +#~ "branches." 
+#~ msgstr "" +#~ "`torch-xpu`:Kunlun Extension for PyTorch" +#~ "(torch-xpu)每 3 个月会在 " +#~ "[PyPi](https://pypi.org/project/torch-xpu) " +#~ "上发布一个稳定版本,每个月发布一个开发版本(即 POC 版本),每天发布一个 nightly " +#~ "版本。PyPi 上的稳定版本**可以**用于 vLLM Kunlun " +#~ "的正式版本,月度开发版本**只能**用于 vLLM Kunlun 的 " +#~ "RC(候选发布)版本以便快速迭代,nightly 版本**不能**用于 vLLM Kunlun " +#~ "的任何版本和分支。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po new file mode 100644 index 0000000..cebbfc9 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po @@ -0,0 +1,177 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/contribution/index.md:1 +msgid "Contributing" +msgstr "贡献" + +#: ../../source/developer_guide/contribution/index.md:3 +#, fuzzy +msgid "Building and Testing" +msgstr "构建与测试" + +#~ msgid "Index" +#~ msgstr "索引" + +#~ msgid "" +#~ "It's recommended to set up a local" +#~ " development environment to build and " +#~ "test before you submit a PR." +#~ msgstr "建议先搭建本地开发环境来进行构建和测试,再提交 PR。" + +#~ msgid "Setup development environment" +#~ msgstr "搭建开发环境" + +#~ msgid "" +#~ "Theoretically, the vllm-kunlun build is" +#~ " only supported on Linux because " +#~ "`vllm-kunlun` dependency `torch_npu` only " +#~ "supports Linux." 
+#~ msgstr "" +#~ "理论上,vllm-kunlun 构建仅支持 Linux,因为 `vllm-" +#~ "kunlun` 的依赖项 `torch_npu` 只支持 Linux。" + +#~ msgid "" +#~ "But you can still set up dev " +#~ "env on Linux/Windows/macOS for linting " +#~ "and basic test as following commands:" +#~ msgstr "但你仍然可以在 Linux/Windows/macOS 上按照以下命令设置开发环境,用于代码规约检查和基本测试:" + +#~ msgid "Run lint locally" +#~ msgstr "在本地运行 lint" + +#~ msgid "Run CI locally" +#~ msgstr "本地运行CI" + +#~ msgid "After complete \"Run lint\" setup, you can run CI locally:" +#~ msgstr "在完成“运行 lint”设置后,你可以在本地运行 CI:" + +#~ msgid "Submit the commit" +#~ msgstr "提交该提交" + +#~ msgid "" +#~ "🎉 Congratulations! You have completed " +#~ "the development environment setup." +#~ msgstr "🎉 恭喜!你已经完成了开发环境的搭建。" + +#~ msgid "Test locally" +#~ msgstr "本地测试" + +#~ msgid "" +#~ "You can refer to [Testing](./testing.md) " +#~ "doc to help you setup testing " +#~ "environment and running tests locally." +#~ msgstr "你可以参考 [测试](./testing.md) 文档,帮助你搭建测试环境并在本地运行测试。" + +#~ msgid "DCO and Signed-off-by" +#~ msgstr "DCO 和签名确认" + +#~ msgid "" +#~ "When contributing changes to this " +#~ "project, you must agree to the " +#~ "DCO. Commits must include a `Signed-" +#~ "off-by:` header which certifies " +#~ "agreement with the terms of the " +#~ "DCO." +#~ msgstr "当为本项目贡献更改时,您必须同意 DCO。提交必须包含 `Signed-off-by:` 头部,以证明您同意 DCO 的条款。" + +#~ msgid "Using `-s` with `git commit` will automatically add this header." +#~ msgstr "在使用 `git commit` 时加上 `-s` 参数会自动添加这个头部信息。" + +#~ msgid "PR Title and Classification" +#~ msgstr "PR 标题与分类" + +#~ msgid "" +#~ "Only specific types of PRs will be" +#~ " reviewed. The PR title is prefixed" +#~ " appropriately to indicate the type " +#~ "of change. Please use one of the" +#~ " following:" +#~ msgstr "只有特定类型的 PR 会被审核。PR 标题应使用合适的前缀以指明更改类型。请使用以下之一:" + +#~ msgid "`[Attention]` for new features or optimization in attention." +#~ msgstr "`[Attention]` 用于注意力机制中新特性或优化。" + +#~ msgid "`[Communicator]` for new features or optimization in communicators." 
+#~ msgstr "`[Communicator]` 适用于通信器中的新特性或优化。" + +#~ msgid "`[ModelRunner]` for new features or optimization in model runner." +#~ msgstr "`[ModelRunner]` 用于模型运行器中的新功能或优化。" + +#~ msgid "`[Platform]` for new features or optimization in platform." +#~ msgstr "`[Platform]` 用于平台中新功能或优化。" + +#~ msgid "`[Worker]` for new features or optimization in worker." +#~ msgstr "`[Worker]` 用于 worker 的新功能或优化。" + +#~ msgid "" +#~ "`[Core]` for new features or " +#~ "optimization in the core vllm-kunlun" +#~ " logic (such as platform, attention, " +#~ "communicators, model runner)" +#~ msgstr "`[Core]` 用于核心 vllm-kunlun 逻辑中的新特性或优化(例如平台、注意力机制、通信器、模型运行器)。" + +#~ msgid "`[Kernel]` changes affecting compute kernels and ops." +#~ msgstr "`[Kernel]` 影响计算内核和操作的更改。" + +#~ msgid "`[Bugfix]` for bug fixes." +#~ msgstr "`[Bugfix]` 用于表示错误修复。" + +#~ msgid "`[Doc]` for documentation fixes and improvements." +#~ msgstr "`[Doc]` 用于文档修复和改进。" + +#~ msgid "`[Test]` for tests (such as unit tests)." +#~ msgstr "`[Test]` 用于测试(如单元测试)。" + +#~ msgid "`[CI]` for build or continuous integration improvements." +#~ msgstr "`[CI]` 用于构建或持续集成的改进。" + +#~ msgid "" +#~ "`[Misc]` for PRs that do not fit" +#~ " the above categories. Please use " +#~ "this sparingly." +#~ msgstr "对于不属于上述类别的 PR,请使用 `[Misc]`。请谨慎使用此标签。" + +#~ msgid "" +#~ "If the PR spans more than one " +#~ "category, please include all relevant " +#~ "prefixes." +#~ msgstr "如果拉取请求(PR)涵盖多个类别,请包含所有相关的前缀。" + +#~ msgid "Others" +#~ msgstr "其他" + +#~ msgid "" +#~ "You may find more information about " +#~ "contributing to vLLM Kunlun backend " +#~ "plugin on " +#~ "[docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html)." +#~ " If you find any problem when " +#~ "contributing, you can feel free to " +#~ "submit a PR to improve the doc " +#~ "to help other developers." 
+#~ msgstr "" +#~ "你可以在 " +#~ "[docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html)" +#~ " 上找到有关为 vLLM Kunlun " +#~ "后端插件做贡献的更多信息。如果你在贡献过程中遇到任何问题,欢迎随时提交 PR 来改进文档,以帮助其他开发者。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/multi_node_test.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/multi_node_test.po new file mode 100644 index 0000000..ff876a6 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/multi_node_test.po @@ -0,0 +1,133 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/contribution/multi_node_test.md:1 +msgid "Multi Node Test" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:3 +msgid "" +"Multi-Node CI is designed to test distributed scenarios of very large " +"models, eg: disaggregated_prefill multi DP across multi nodes and so on." 
+msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:5 +msgid "How is works" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:7 +msgid "" +"The following picture shows the basic deployment view of the multi-node " +"CI mechanism, It shows how the github action interact with " +"[lws](https://lws.sigs.k8s.io/docs/overview/) (a kind of kubernetes crd " +"resource)" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:9 +msgid "![alt text](../../assets/deployment.png)" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:9 +#: ../../source/developer_guide/contribution/multi_node_test.md:13 +msgid "alt text" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:11 +msgid "" +"From the workflow perspective, we can see how the final test script is " +"executed, The key point is that these two [lws.yaml and " +"run.sh](https://github.com/vllm-project/vllm-" +"kunlun/tree/main/tests/e2e/nightly/multi_node/scripts), The former " +"defines how our k8s cluster is pulled up, and the latter defines the " +"entry script when the pod is started, Each node executes different logic " +"according to the " +"[LWS_WORKER_INDEX](https://lws.sigs.k8s.io/docs/reference/labels-" +"annotations-and-environment-variables/) environment variable, so that " +"multiple nodes can form a distributed cluster to perform tasks." 
+msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:13 +msgid "![alt text](../../assets/workflow.png)" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:15 +msgid "How to contribute" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:17 +msgid "Upload custom weights" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:19 +msgid "" +"If you need customized weights, for example, you quantized a w8a8 weight " +"for DeepSeek-V3 and you want your weight to run on CI, Uploading weights " +"to ModelScope's [vllm-kunlun](https://www.modelscope.cn/organization" +"/vllm-kunlun) organization is welcome, If you do not have permission to " +"upload, please contact @Potabk" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:21 +msgid "Add config yaml" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:23 +msgid "" +"As the entrypoint script [run.sh](https://github.com/vllm-project/vllm-" +"kunlun/blob/0bf3f21a987aede366ec4629ad0ffec8e32fe90d/tests/e2e/nightly/multi_node/scripts/run.sh#L106)" +" shows, A k8s pod startup means traversing all *.yaml files in the " +"[directory](https://github.com/vllm-project/vllm-" +"kunlun/tree/main/tests/e2e/nightly/multi_node/config/models), reading and" +" executing according to different configurations, so what we need to do " +"is just add \"yamls\" like [DeepSeek-V3.yaml](https://github.com/vllm-" +"project/vllm-" +"kunlun/blob/main/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml)." 
+msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:25 +msgid "" +"Suppose you have **2 nodes** running a 1P1D setup (1 Prefillers + 1 " +"Decoder):" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:27 +msgid "you may add a config file looks like:" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:69 +msgid "" +"Add the case to nightly workflow currently, the multi-node test workflow " +"defined in the [vllm_kunlun_test_nightly_a2/a3.yaml](https://github.com" +"/vllm-project/vllm-" +"kunlun/blob/main/.github/workflows/vllm_kunlun_test_nightly_a3.yaml)" +msgstr "" + +#: ../../source/developer_guide/contribution/multi_node_test.md:99 +msgid "" +"The matrix above defines all the parameters required to add a multi-" +"machine use case, The parameters worth paying attention to (I mean if you" +" are adding a new use case) are size and the path to the yaml " +"configuration file. The former defines the number of nodes required for " +"your use case, and the latter defines the path to the configuration file " +"you have completed in step 2." +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po new file mode 100644 index 0000000..94ebdc4 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po @@ -0,0 +1,265 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/contribution/testing.md:1 +msgid "Testing" +msgstr "测试" + +#: ../../source/developer_guide/contribution/testing.md:3 +#, fuzzy +msgid "" +"This document explains how to write E2E tests and unit tests to verify " +"the implementation of your feature." +msgstr "本文档介绍如何编写端到端测试和单元测试,以验证你的功能实现。" + +#: ../../source/developer_guide/contribution/testing.md:5 +#, fuzzy +msgid "Setup a test environment" +msgstr "设置测试环境" + +#: ../../source/developer_guide/contribution/testing.md:7 +#, fuzzy +msgid "" +"The fastest way to setup a test environment is to use the main branch's " +"container image:" +msgstr "搭建测试环境最快的方法是使用 main 分支的容器镜像:" + +#: ../../source/developer_guide/contribution/testing.md +msgid "Local (CPU)" +msgstr "本地(CPU)" + +#: ../../source/developer_guide/contribution/testing.md:18 +#, fuzzy +msgid "You can run the unit tests on CPUs with the following steps:" +msgstr "你可以按照以下步骤在 CPU 上运行单元测试:" + +#: ../../source/developer_guide/contribution/testing.md +msgid "Single card" +msgstr "单卡" + +#: ../../source/developer_guide/contribution/testing.md:86 +#: ../../source/developer_guide/contribution/testing.md:125 +msgid "After starting the container, you should install the required packages:" +msgstr "启动容器后,你应该安装所需的软件包:" + +#: ../../source/developer_guide/contribution/testing.md +msgid "Multi cards" +msgstr "多卡" + +#: ../../source/developer_guide/contribution/testing.md:139 +msgid "Running tests" +msgstr "运行测试" + +#: ../../source/developer_guide/contribution/testing.md:141 +#, fuzzy +msgid "Unit 
tests" +msgstr "单元测试" + +#: ../../source/developer_guide/contribution/testing.md:143 +msgid "There are several principles to follow when writing unit tests:" +msgstr "编写单元测试时需要遵循几个原则:" + +#: ../../source/developer_guide/contribution/testing.md:145 +#, fuzzy +msgid "" +"The test file path should be consistent with the source file and start " +"with the `test_` prefix, such as: `vllm_kunlun/worker/worker_v1.py` --> " +"`tests/ut/worker/test_worker_v1.py`" +msgstr "" +"测试文件的路径应与源文件保持一致,并以 `test_` 前缀开头,例如:`vllm_kunlun/worker/worker_v1.py` -->" +" `tests/ut/worker/test_worker_v1.py`" + +#: ../../source/developer_guide/contribution/testing.md:146 +#, fuzzy +msgid "" +"The vLLM Kunlun test uses unittest framework. See " +"[here](https://docs.python.org/3/library/unittest.html#module-unittest) " +"to understand how to write unit tests." +msgstr "" +"vLLM Kunlun 测试使用 unittest " +"框架,参见[这里](https://docs.python.org/3/library/unittest.html#module-" +"unittest)了解如何编写单元测试。" + +#: ../../source/developer_guide/contribution/testing.md:147 +#, fuzzy +msgid "" +"All unit tests can be run on CPUs, so you must mock the device-related " +"function to host." +msgstr "所有单元测试都可以在 CPU 上运行,因此你必须将与设备相关的函数模拟为 host。" + +#: ../../source/developer_guide/contribution/testing.md:148 +msgid "" +"Example: [tests/ut/test_kunlun_config.py](https://github.com/vllm-project" +"/vllm-kunlun/blob/main/tests/ut/test_kunlun_config.py)." 
+msgstr "" +"示例:[tests/ut/test_kunlun_config.py](https://github.com/vllm-project/vllm-" +"kunlun/blob/main/tests/ut/test_kunlun_config.py)。" + +#: ../../source/developer_guide/contribution/testing.md:149 +msgid "You can run the unit tests using `pytest`:" +msgstr "你可以使用 `pytest` 运行单元测试:" + +#: ../../source/developer_guide/contribution/testing.md +#, fuzzy +msgid "Single-card" +msgstr "单卡" + +#: ../../source/developer_guide/contribution/testing.md +#, fuzzy +msgid "Multi-card" +msgstr "多卡" + +#: ../../source/developer_guide/contribution/testing.md:196 +msgid "E2E test" +msgstr "端到端测试" + +#: ../../source/developer_guide/contribution/testing.md:198 +#, fuzzy +msgid "" +"Although vllm-kunlun CI provides the [E2E test](https://github.com/vllm-" +"project/vllm-kunlun/blob/main/.github/workflows/vllm_kunlun_test.yaml) on" +" Kunlun CI, you can run it locally." +msgstr "" +"虽然 vllm-kunlun CI 在 Kunlun CI 上提供了 [端到端测试](https://github.com/vllm-" +"project/vllm-" +"kunlun/blob/main/.github/workflows/vllm_kunlun_test.yaml),你也可以在本地运行它。" + +#: ../../source/developer_guide/contribution/testing.md:208 +#, fuzzy +msgid "You can't run the E2E test on CPUs." +msgstr "你无法在 CPU 上运行 e2e 测试。" + +#: ../../source/developer_guide/contribution/testing.md:247 +#, fuzzy +msgid "" +"This will reproduce the E2E test. See " +"[vllm_kunlun_test.yaml](https://github.com/vllm-project/vllm-" +"kunlun/blob/main/.github/workflows/vllm_kunlun_test.yaml)." 
+msgstr "" +"这将复现端到端测试:[vllm_kunlun_test.yaml](https://github.com/vllm-project/vllm-" +"kunlun/blob/main/.github/workflows/vllm_kunlun_test.yaml)。" + +#: ../../source/developer_guide/contribution/testing.md:249 +msgid "E2E test example:" +msgstr "E2E 测试示例:" + +#: ../../source/developer_guide/contribution/testing.md:251 +msgid "" +"Offline test example: " +"[`tests/e2e/singlecard/test_offline_inference.py`](https://github.com" +"/vllm-project/vllm-" +"kunlun/blob/main/tests/e2e/singlecard/test_offline_inference.py)" +msgstr "" +"离线测试示例:[`tests/e2e/singlecard/test_offline_inference.py`](https://github.com" +"/vllm-project/vllm-" +"kunlun/blob/main/tests/e2e/singlecard/test_offline_inference.py)" + +#: ../../source/developer_guide/contribution/testing.md:252 +msgid "" +"Online test examples: " +"[`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com" +"/vllm-project/vllm-" +"kunlun/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)" +msgstr "" +"在线测试示例:[`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com" +"/vllm-project/vllm-" +"kunlun/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)" + +#: ../../source/developer_guide/contribution/testing.md:253 +msgid "" +"Correctness test example: " +"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project" +"/vllm-kunlun/blob/main/tests/e2e/singlecard/test_aclgraph.py)" +msgstr "" +"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com" +"/vllm-project/vllm-" +"kunlun/blob/main/tests/e2e/singlecard/test_aclgraph.py)" + +#: ../../source/developer_guide/contribution/testing.md:254 +msgid "" +"Reduced Layer model test example: [test_torchair_graph_mode.py - " +"DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-" +"kunlun/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)" +msgstr "" +"简化层模型测试示例:[test_torchair_graph_mode.py - " +"DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-" 
+"kunlun/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)" + +#: ../../source/developer_guide/contribution/testing.md:256 +#, fuzzy +msgid "" +"The CI resource is limited, and you might need to reduce the number of " +"layers of a model. Below is an example of how to generate a reduced layer" +" model:" +msgstr "CI 资源有限,您可能需要减少模型的层数,下面是一个生成减少层数模型的示例:" + +#: ../../source/developer_guide/contribution/testing.md:257 +#, fuzzy +msgid "" +"Fork the original model repo in modelscope. All the files in the repo " +"except for weights are required." +msgstr "在 modelscope 中 fork 原始模型仓库,我们需要仓库中的所有文件,除了权重文件。" + +#: ../../source/developer_guide/contribution/testing.md:258 +#, python-brace-format +msgid "" +"Set `num_hidden_layers` to the expected number of layers, e.g., " +"`{\"num_hidden_layers\": 2,}`" +msgstr "将 `num_hidden_layers` 设置为期望的层数,例如 `{\"num_hidden_layers\": 2,}`" + +#: ../../source/developer_guide/contribution/testing.md:259 +msgid "" +"Copy the following python script as `generate_random_weight.py`. Set the " +"relevant parameters `MODEL_LOCAL_PATH`, `DIST_DTYPE` and " +"`DIST_MODEL_PATH` as needed:" +msgstr "" +"将以下 Python 脚本复制为 `generate_random_weight.py`。根据需要设置相关参数 " +"`MODEL_LOCAL_PATH`、`DIST_DTYPE` 和 `DIST_MODEL_PATH`:" + +#: ../../source/developer_guide/contribution/testing.md:277 +msgid "Run doctest" +msgstr "运行 doctest" + +#: ../../source/developer_guide/contribution/testing.md:279 +#, fuzzy +msgid "" +"vllm-kunlun provides a `vllm-kunlun/tests/e2e/run_doctests.sh` command to" +" run all doctests in the doc files. 
The doctest is a good way to make " +"sure docs stay current and examples remain executable, which can be run " +"locally as follows:" +msgstr "" +"vllm-kunlun 提供了一个 `vllm-kunlun/tests/e2e/run_doctests.sh` 命令,用于运行文档文件中的所有" +" doctest。doctest 是确保文档保持最新且示例可执行的好方法,你可以按照以下方式在本地运行它:" + +#: ../../source/developer_guide/contribution/testing.md:287 +#, fuzzy +msgid "" +"This will reproduce the same environment as the CI. See " +"[vllm_kunlun_doctest.yaml](https://github.com/vllm-project/vllm-" +"kunlun/blob/main/.github/workflows/vllm_kunlun_doctest.yaml)." +msgstr "" +"这将复现与 CI 相同的环境:[vllm_kunlun_doctest.yaml](https://github.com/vllm-project" +"/vllm-kunlun/blob/main/.github/workflows/vllm_kunlun_doctest.yaml)。" + +#~ msgid "Multi cards test" +#~ msgstr "多卡测试" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.po new file mode 100644 index 0000000..972a668 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.md:1 +msgid "deepseek-ai/DeepSeek-V2-Lite" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.po new file mode 100644 index 0000000..4215a6c --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.md:1 +msgid "Qwen/Qwen2.5-VL-7B-Instruct" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.po new file mode 100644 index 0000000..cb82cf6 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.md:1 +msgid "Qwen/Qwen3-30B-A3B" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.po new file mode 100644 index 0000000..0ca5745 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.md:1 +msgid "Qwen/Qwen3-8B-Base" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po new file mode 100644 index 0000000..f2a0fba --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/accuracy_report/index.md:1 +#: ../../developer_guide/evaluation/accuracy_report/index.md:3 +msgid "Accuracy Report" +msgstr "准确性报告" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po new file mode 100644 index 0000000..e497586 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/index.md:1 +#: ../../developer_guide/evaluation/index.md:3 +msgid "Accuracy" +msgstr "准确性" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_ais_bench.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_ais_bench.po new file mode 100644 index 0000000..9bf8ad0 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_ais_bench.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/using_ais_bench.md:1 +msgid "Using AISBench" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po new file mode 100644 index 0000000..d132aec --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po @@ -0,0 +1,100 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/using_evalscope.md:1 +msgid "Using EvalScope" +msgstr "使用 EvalScope" + +#~ msgid "" +#~ "This document will guide you have " +#~ "model inference stress testing and " +#~ "accuracy testing using " +#~ "[EvalScope](https://github.com/modelscope/evalscope)." +#~ msgstr "" +#~ "本文档将指导您如何使用 [EvalScope](https://github.com/modelscope/evalscope)" +#~ " 进行模型推理压力测试和精度测试。" + +#~ msgid "1. Online serving" +#~ msgstr "1. 
在线服务" + +#~ msgid "You can run docker container to start the vLLM server on a single XPU:" +#~ msgstr "你可以运行 docker 容器,在单个 XPU 上启动 vLLM 服务器:" + +#~ msgid "If your service start successfully, you can see the info shown below:" +#~ msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#~ msgid "" +#~ "Once your server is started, you " +#~ "can query the model with input " +#~ "prompts in new terminal:" +#~ msgstr "一旦你的服务器启动后,你可以在新的终端中用输入提示词查询模型:" + +#~ msgid "2. Install EvalScope using pip" +#~ msgstr "2. 使用 pip 安装 EvalScope" + +#~ msgid "You can install EvalScope by using:" +#~ msgstr "你可以使用以下方式安装 EvalScope:" + +#~ msgid "3. Run gsm8k accuracy test using EvalScope" +#~ msgstr "3. 使用 EvalScope 运行 gsm8k 准确率测试" + +#~ msgid "You can `evalscope eval` run gsm8k accuracy test:" +#~ msgstr "你可以使用 `evalscope eval` 运行 gsm8k 准确率测试:" + +#~ msgid "After 1-2 mins, the output is as shown below:" +#~ msgstr "1-2 分钟后,输出如下所示:" + +#~ msgid "" +#~ "See more detail in: [EvalScope doc " +#~ "- Model API Service " +#~ "Evaluation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html" +#~ "#model-api-service-evaluation)." +#~ msgstr "" +#~ "更多详情请见:[EvalScope 文档 - 模型 API " +#~ "服务评测](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html" +#~ "#model-api-service-evaluation)。" + +#~ msgid "4. Run model inference stress testing using EvalScope" +#~ msgstr "4. 使用 EvalScope 运行模型推理压力测试" + +#~ msgid "Install EvalScope[perf] using pip" +#~ msgstr "使用 pip 安装 EvalScope[perf]" + +#~ msgid "Basic usage" +#~ msgstr "基本用法" + +#~ msgid "You can use `evalscope perf` run perf test:" +#~ msgstr "你可以使用 `evalscope perf` 运行性能测试:" + +#~ msgid "Output results" +#~ msgstr "输出结果" + +#~ msgid "" +#~ "See more detail in: [EvalScope doc " +#~ "- Model Inference Stress " +#~ "Testing](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html" +#~ "#basic-usage)." 
+#~ msgstr "" +#~ "更多详情见:[EvalScope 文档 - " +#~ "模型推理压力测试](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html" +#~ "#basic-usage)。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po new file mode 100644 index 0000000..96bafa5 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po @@ -0,0 +1,62 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/using_lm_eval.md:1 +msgid "Using lm-eval" +msgstr "使用 lm-eval" + +#~ msgid "" +#~ "This document will guide you have " +#~ "a accuracy testing using [lm-" +#~ "eval](https://github.com/EleutherAI/lm-evaluation-" +#~ "harness)." +#~ msgstr "" +#~ "本文将指导你如何使用 [lm-eval](https://github.com/EleutherAI/lm-" +#~ "evaluation-harness) 进行准确率测试。" + +#~ msgid "1. Run docker container" +#~ msgstr "1. 运行 docker 容器" + +#~ msgid "You can run docker container on a single XPU:" +#~ msgstr "你可以在单个XPU上运行docker容器:" + +#~ msgid "2. Run ceval accuracy test using lm-eval" +#~ msgstr "2. 使用 lm-eval 运行 ceval 准确性测试" + +#~ msgid "Install lm-eval in the container." 
+#~ msgstr "在容器中安装 lm-eval。" + +#~ msgid "Run the following command:" +#~ msgstr "运行以下命令:" + +#~ msgid "After 1-2 mins, the output is as shown below:" +#~ msgstr "1-2 分钟后,输出如下所示:" + +#~ msgid "" +#~ "You can see more usage on [Lm-" +#~ "eval Docs](https://github.com/EleutherAI/lm-evaluation-" +#~ "harness/blob/main/docs/README.md)." +#~ msgstr "" +#~ "你可以在 [Lm-eval 文档](https://github.com/EleutherAI" +#~ "/lm-evaluation-harness/blob/main/docs/README.md) " +#~ "上查看更多用法。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po new file mode 100644 index 0000000..6309b62 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po @@ -0,0 +1,77 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/evaluation/using_opencompass.md:1 +msgid "Using OpenCompass" +msgstr "使用 OpenCompass" + +#~ msgid "" +#~ "This document will guide you have " +#~ "a accuracy testing using " +#~ "[OpenCompass](https://github.com/open-compass/opencompass)." +#~ msgstr "" +#~ "本文档将指导你如何使用 [OpenCompass](https://github.com/open-" +#~ "compass/opencompass) 进行准确率测试。" + +#~ msgid "1. Online Serving" +#~ msgstr "1. 
在线服务" + +#~ msgid "You can run docker container to start the vLLM server on a single XPU:" +#~ msgstr "你可以运行 docker 容器,在单个 XPU 上启动 vLLM 服务器:" + +#~ msgid "If your service start successfully, you can see the info shown below:" +#~ msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#~ msgid "" +#~ "Once your server is started, you " +#~ "can query the model with input " +#~ "prompts in new terminal:" +#~ msgstr "一旦你的服务器启动后,你可以在新的终端中用输入提示词查询模型:" + +#~ msgid "2. Run ceval accuracy test using OpenCompass" +#~ msgstr "2. 使用 OpenCompass 运行 ceval 准确率测试" + +#~ msgid "" +#~ "Install OpenCompass and configure the " +#~ "environment variables in the container." +#~ msgstr "在容器中安装 OpenCompass 并配置环境变量。" + +#~ msgid "" +#~ "Add `opencompass/configs/eval_vllm_kunlun_demo.py` with" +#~ " the following content:" +#~ msgstr "添加 `opencompass/configs/eval_vllm_kunlun_demo.py`,内容如下:" + +#~ msgid "Run the following command:" +#~ msgstr "运行以下命令:" + +#~ msgid "After 1-2 mins, the output is as shown below:" +#~ msgstr "1-2 分钟后,输出如下所示:" + +#~ msgid "" +#~ "You can see more usage on " +#~ "[OpenCompass " +#~ "Docs](https://opencompass.readthedocs.io/en/latest/index.html)." +#~ msgstr "" +#~ "你可以在 [OpenCompass " +#~ "文档](https://opencompass.readthedocs.io/en/latest/index.html) " +#~ "查看更多用法。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/ACL_Graph.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/ACL_Graph.po new file mode 100644 index 0000000..28f733a --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/ACL_Graph.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/feature_guide/ACL_Graph.md:1 +msgid "Graph" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/KV_Cache_Pool_Guide.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/KV_Cache_Pool_Guide.po new file mode 100644 index 0000000..03627a1 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/KV_Cache_Pool_Guide.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/feature_guide/KV_Cache_Pool_Guide.md:1 +msgid "KV Cache Pool" +msgstr "" + +#: ../../source/developer_guide/feature_guide/KV_Cache_Pool_Guide.md:3 +msgid "Why KV Cache Pool?" 
+msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/ModelRunner_prepare_inputs.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/ModelRunner_prepare_inputs.po new file mode 100644 index 0000000..83fc8b0 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/ModelRunner_prepare_inputs.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md:1 +msgid "Prepare inputs for model forwarding" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/Multi_Token_Prediction.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/Multi_Token_Prediction.po new file mode 100644 index 0000000..a43c234 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/Multi_Token_Prediction.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/feature_guide/Multi_Token_Prediction.md:1 +msgid "Multi Token Prediction (MTP)" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/eplb_swift_balancer.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/eplb_swift_balancer.po new file mode 100644 index 0000000..5577689 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/eplb_swift_balancer.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/feature_guide/eplb_swift_balancer.md:1 +msgid "Expert Parallelism Load Balancer (EPLB)" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po new file mode 100644 index 0000000..5bdc828 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po @@ -0,0 +1,33 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/feature_guide/index.md:1 +#: ../../developer_guide/feature_guide/index.md:5 +msgid "Feature Guide" +msgstr "功能指南" + +#: ../../developer_guide/feature_guide/index.md:3 +msgid "" +"This section provides an overview of the features implemented in vLLM " +"Kunlun. Developers can refer to this guide to understand how vLLM Kunlun " +"works." 
+msgstr "本节概述了 vLLM Kunlun 中实现的功能。开发者可以参考本指南以了解 vLLM Kunlun 的工作原理。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po new file mode 100644 index 0000000..f325373 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po @@ -0,0 +1,288 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/feature_guide/patch.md:1 +#, fuzzy +msgid "Patch in vLLM" +msgstr "在 vLLM Kunlun 中的补丁" + +#~ msgid "" +#~ "vLLM Kunlun is a platform plugin " +#~ "for vLLM. Due to the release cycle" +#~ " of vLLM and vLLM Kunlun is " +#~ "different, and the hardware limitation " +#~ "in some case, we need to patch " +#~ "some code in vLLM to make it " +#~ "compatible with vLLM Kunlun." +#~ msgstr "" +#~ "vLLM Kunlun 是 vLLM 的一个平台插件。由于 vLLM " +#~ "和 vLLM Kunlun 的发布周期不同,并且在某些情况下存在硬件限制,我们需要对 " +#~ "vLLM 进行一些代码补丁,以使其能够兼容 vLLM Kunlun。" + +#~ msgid "" +#~ "In vLLM Kunlun code, we provide a" +#~ " patch module `vllm_kunlun/patch` to " +#~ "address the change for vLLM." +#~ msgstr "在 vLLM Kunlun 代码中,我们提供了一个补丁模块 `vllm_kunlun/patch` 用于应对 vLLM 的变更。" + +#~ msgid "Principle" +#~ msgstr "原理" + +#~ msgid "" +#~ "We should keep in mind that Patch" +#~ " is not the best way to make" +#~ " vLLM Kunlun compatible. It's just a" +#~ " temporary solution. 
The best way is" +#~ " to contribute the change to vLLM " +#~ "to make it compatible with vLLM " +#~ "Kunlun originally. In vLLM Kunlun, we" +#~ " have the basic principle for Patch" +#~ " strategy:" +#~ msgstr "" +#~ "我们需要记住,Patch 不是让 vLLM 兼容 Kunlun " +#~ "的最佳方式,这只是一个临时的解决方案。最好的方法是将修改贡献到 vLLM 项目中,从而让 vLLM" +#~ " 原生支持 Kunlun。对于 vLLM Kunlun,我们对 Patch " +#~ "策略有一个基本原则:" + +#~ msgid "Less is more. Please do not patch unless it's the only way currently." +#~ msgstr "少即是多。请不要打补丁,除非这是目前唯一的方法。" + +#~ msgid "" +#~ "Once a patch is added, it's " +#~ "required to describe the future plan " +#~ "for removing the patch." +#~ msgstr "一旦补丁被添加,必须说明将来移除该补丁的计划。" + +#~ msgid "Anytime, clean the patch code is welcome." +#~ msgstr "任何时候,欢迎清理补丁代码。" + +#~ msgid "How it works" +#~ msgstr "工作原理" + +#~ msgid "In `vllm_kunlun/patch`, you can see the code structure as follows:" +#~ msgstr "在 `vllm_kunlun/patch` 目录中,你可以看到如下代码结构:" + +#~ msgid "" +#~ "**platform**: The patch code in this " +#~ "directory is for patching the code " +#~ "in vLLM main process. It's called " +#~ "by `vllm_kunlun/platform::XPUPlatform::pre_register_and_update`" +#~ " very early when vLLM is initialized." +#~ msgstr "" +#~ "**platform**:此目录下的补丁代码用于修补 vLLM 主进程中的代码。当 vLLM " +#~ "初始化时,会在很早的阶段由 " +#~ "`vllm_kunlun/platform::XPUPlatform::pre_register_and_update` 调用。" + +#~ msgid "" +#~ "For online mode, vLLM process calls " +#~ "the platform patch here " +#~ "`vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` " +#~ "when parsing the cli args." +#~ msgstr "" +#~ "对于在线模式,vLLM 进程在解析命令行参数时,会在 " +#~ "`vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` " +#~ "这里调用平台补丁。" + +#~ msgid "" +#~ "For offline mode, vLLM process calls " +#~ "the platform patch here " +#~ "`vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` " +#~ "when parsing the input parameters." 
+#~ msgstr "" +#~ "对于离线模式,vLLM 进程在解析输入参数时,会在此处调用平台补丁 " +#~ "`vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config`。" + +#~ msgid "" +#~ "**worker**: The patch code in this " +#~ "directory is for patching the code " +#~ "in vLLM worker process. It's called " +#~ "by `vllm_kunlun/worker/worker_v1::XPUWorker::__init__` " +#~ "when the vLLM worker process is " +#~ "initialized." +#~ msgstr "" +#~ "**worker**:此目录中的补丁代码用于修补 vLLM worker 进程中的代码。在初始化 " +#~ "vLLM worker 进程时,会被 " +#~ "`vllm_kunlun/worker/worker_v1::XPUWorker::__init__` 调用。" + +#~ msgid "" +#~ "For both online and offline mode, " +#~ "vLLM engine core process calls the " +#~ "worker patch here " +#~ "`vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` " +#~ "when initializing the worker process." +#~ msgstr "" +#~ "无论是在线还是离线模式,vLLM 引擎核心进程在初始化 worker 进程时,都会在这里调用 " +#~ "worker " +#~ "补丁:`vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker`。" + +#~ msgid "" +#~ "In both **platform** and **worker** " +#~ "folder, there are several patch modules." +#~ " They are used for patching different" +#~ " version of vLLM." +#~ msgstr "在 **platform** 和 **worker** 文件夹中都有一些补丁模块。它们用于修补不同版本的 vLLM。" + +#~ msgid "" +#~ "`patch_0_9_2`: This module is used for" +#~ " patching vLLM 0.9.2. The version is" +#~ " always the nearest version of vLLM." +#~ " Once vLLM is released, we will " +#~ "drop this patch module and bump to" +#~ " a new version. For example, " +#~ "`patch_0_9_2` is used for patching vLLM" +#~ " 0.9.2." +#~ msgstr "" +#~ "`patch_0_9_2`:此模块用于修补 vLLM 0.9.2。该版本始终对应于 vLLM " +#~ "的最近版本。一旦 vLLM 发布新版本,我们将移除此补丁模块并升级到新版本。例如,`patch_0_9_2` " +#~ "就是用于修补 vLLM 0.9.2 的。" + +#~ msgid "" +#~ "`patch_main`: This module is used for" +#~ " patching the code in vLLM main " +#~ "branch." +#~ msgstr "`patch_main`:该模块用于修补 vLLM 主分支代码。" + +#~ msgid "" +#~ "`patch_common`: This module is used for" +#~ " patching both vLLM 0.9.2 and vLLM" +#~ " main branch." 
+#~ msgstr "`patch_common`:此模块用于同时修补 vLLM 0.9.2 版本和 vLLM 主分支。" + +#~ msgid "How to write a patch" +#~ msgstr "如何撰写补丁" + +#~ msgid "" +#~ "Before writing a patch, following the" +#~ " principle above, we should patch the" +#~ " least code. If it's necessary, we" +#~ " can patch the code in either " +#~ "**platform** and **worker** folder. Here " +#~ "is an example to patch `distributed` " +#~ "module in vLLM." +#~ msgstr "" +#~ "在编写补丁之前,遵循上述原则,我们应尽量修改最少的代码。如果有必要,我们可以修改 **platform** 和" +#~ " **worker** 文件夹中的代码。下面是一个在 vLLM 中修改 " +#~ "`distributed` 模块的示例。" + +#~ msgid "" +#~ "Decide which version of vLLM we " +#~ "should patch. For example, after " +#~ "analysis, here we want to patch " +#~ "both 0.9.2 and main of vLLM." +#~ msgstr "决定我们应该修补哪个版本的 vLLM。例如,经过分析后,这里我们想要同时修补 vLLM 的 0.9.2 版和主分支(main)。" + +#~ msgid "" +#~ "Decide which process we should patch." +#~ " For example, here `distributed` belongs" +#~ " to the vLLM main process, so " +#~ "we should patch `platform`." +#~ msgstr "决定我们应该修补哪个进程。例如,这里 `distributed` 属于 vLLM 主进程,所以我们应该修补 `platform`。" + +#~ msgid "" +#~ "Create the patch file in the right" +#~ " folder. The file should be named " +#~ "as `patch_{module_name}.py`. The example here" +#~ " is " +#~ "`vllm_kunlun/patch/platform/patch_common/patch_distributed.py`." +#~ msgstr "" +#~ "在正确的文件夹中创建补丁文件。文件应命名为 `patch_{module_name}.py`。此处的示例是 " +#~ "`vllm_kunlun/patch/platform/patch_common/patch_distributed.py`。" + +#~ msgid "Write your patch code in the new file. Here is an example:" +#~ msgstr "在新文件中编写你的补丁代码。以下是一个示例:" + +#~ msgid "" +#~ "Import the patch file in `__init__.py`." +#~ " In this example, add `import " +#~ "vllm_kunlun.patch.platform.patch_common.patch_distributed` into" +#~ " `vllm_kunlun/patch/platform/patch_common/__init__.py`." 
+#~ msgstr "" +#~ "在 `__init__.py` 中导入补丁文件。在这个示例中,将 `import " +#~ "vllm_kunlun.patch.platform.patch_common.patch_distributed` 添加到" +#~ " `vllm_kunlun/patch/platform/patch_common/__init__.py` 中。" + +#~ msgid "" +#~ "Add the description of the patch " +#~ "in `vllm_kunlun/patch/__init__.py`. The description" +#~ " format is as follows:" +#~ msgstr "在 `vllm_kunlun/patch/__init__.py` 中添加补丁的描述。描述格式如下:" + +#~ msgid "" +#~ "Add the Unit Test and E2E Test." +#~ " Any newly added code in vLLM " +#~ "Kunlun should contain the Unit Test " +#~ "and E2E Test as well. You can " +#~ "find more details in [test " +#~ "guide](../contribution/testing.md)" +#~ msgstr "" +#~ "添加单元测试和端到端(E2E)测试。在 vLLM Kunlun " +#~ "中新增的任何代码也应包含单元测试和端到端测试。更多详情请参见 " +#~ "[测试指南](../contribution/testing.md)。" + +#~ msgid "Limitation" +#~ msgstr "限制" + +#~ msgid "" +#~ "In V1 Engine, vLLM starts three " +#~ "kinds of process: Main process, " +#~ "EngineCore process and Worker process. " +#~ "Now vLLM Kunlun only support patch " +#~ "the code in Main process and " +#~ "Worker process by default. If you " +#~ "want to patch the code runs in " +#~ "EngineCore process, you should patch " +#~ "EngineCore process entirely during setup, " +#~ "the entry code is here " +#~ "`vllm.v1.engine.core`. Please override " +#~ "`EngineCoreProc` and `DPEngineCoreProc` entirely." +#~ msgstr "" +#~ "在 V1 引擎中,vLLM 会启动三种类型的进程:主进程、EngineCore 进程和" +#~ " Worker 进程。现在 vLLM Kunlun 默认只支持在主进程和 " +#~ "Worker 进程中打补丁代码。如果你想要在 EngineCore 进程中打补丁,你需要在设置阶段对" +#~ " EngineCore 进程整体打补丁,入口代码在 `vllm.v1.engine.core`。请完全重写" +#~ " `EngineCoreProc` 和 `DPEngineCoreProc`。" + +#~ msgid "" +#~ "If you are running an edited vLLM" +#~ " code, the version of the vLLM " +#~ "may be changed automatically. 
For " +#~ "example, if you runs an edited " +#~ "vLLM based on v0.9.n, the version " +#~ "of vLLM may be change to " +#~ "v0.9.nxxx, in this case, the patch " +#~ "for v0.9.n in vLLM Kunlun would " +#~ "not work as expect, because that " +#~ "vLLM Kunlun can't distinguish the " +#~ "version of vLLM you're using. In " +#~ "this case, you can set the " +#~ "environment variable `VLLM_VERSION` to specify" +#~ " the version of vLLM you're using," +#~ " then the patch for v0.9.2 should " +#~ "work." +#~ msgstr "" +#~ "如果你运行的是经过编辑的 vLLM 代码,vLLM 的版本可能会被自动更改。例如,如果你基于 " +#~ "v0.9.n 运行了编辑后的 vLLM,vLLM 的版本可能会变为 " +#~ "v0.9.nxxx,在这种情况下,vLLM Kunlun 的 v0.9.n " +#~ "补丁将无法正常工作,因为 vLLM Kunlun 无法区分你所使用的 vLLM " +#~ "版本。这时,你可以设置环境变量 `VLLM_VERSION` 来指定你所使用的 vLLM " +#~ "版本,这样对 v0.9.2 的补丁就应该可以正常工作。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po new file mode 100644 index 0000000..c16f18d --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po @@ -0,0 +1,333 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/modeling/adding_a_new_model.md:1 +msgid "Adding a New Model" +msgstr "添加新模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:3 +msgid "" +"This guide demonstrates how to integrate a novel or customized model into " +"vllm-kunlun. For foundational concepts, it is highly recommended to refer to" +" [vllm official doc: Adding a New " +"Model](https://docs.vllm.ai/en/stable/contributing/model/) first." +msgstr "" +"本指南演示如何将新颖或自定义的模型集成到 vllm-kunlun 中。对于基础概念,强烈建议先参考 [vllm " +"官方文档:添加新模型](https://docs.vllm.ai/en/stable/contributing/model/)。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:6 +msgid "Step 1: Implementing Models with `torch` and `torch_npu`" +msgstr "步骤 1:使用 `torch` 和 `torch_npu` 实现模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:8 +msgid "" +"This section provides instructions for implementing new models compatible " +"with vllm and vllm-kunlun." +msgstr "本节提供了实现与 vllm 和 vllm-kunlun 兼容的新模型的相关说明。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:10 +msgid "**Before starting:**" +msgstr "**开始之前:**" + +#: ../../developer_guide/modeling/adding_a_new_model.md:12 +msgid "" +"Verify whether your model already exists in vllm's " +"[models](https://github.com/vllm-" +"project/vllm/tree/main/vllm/model_executor/models) directory." 
+msgstr "" +"请确认你的模型是否已经存在于 vllm 的 [models](https://github.com/vllm-" +"project/vllm/tree/main/vllm/model_executor/models) 目录中。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:13 +msgid "" +"Use existing models' implementation as templates to accelerate your " +"development." +msgstr "使用已有模型的实现作为模板以加速您的开发。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:15 +msgid "Method 1: Implementing New Models from Scratch" +msgstr "方法一:从零开始实现新模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:17 +msgid "" +"Follow vllm's [OPT model " +"adaptation](https://docs.vllm.ai/en/stable/contributing/model/basic.html) " +"example for guidance." +msgstr "" +"请参考 vllm 的 [OPT " +"模型适配](https://docs.vllm.ai/en/stable/contributing/model/basic.html) 示例进行操作。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:19 +msgid "**Key implementation requirements:**" +msgstr "**关键实现要求:**" + +#: ../../developer_guide/modeling/adding_a_new_model.md:21 +msgid "Place model files in `vllm_kunlun/models/` directory." 
+msgstr "请将模型文件放在 `vllm_kunlun/models/` 目录下。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:23 +msgid "" +"Standard module structure for decoder-only LLMs (please checkout vllm's " +"implementations for other kinds of model):" +msgstr "仅解码器(decoder-only)LLM 的标准模块结构(请参考 vllm 对其他类型模型的实现):" + +#: ../../developer_guide/modeling/adding_a_new_model.md:25 +msgid "`*ModelForCausalLM` (top-level wrapper)" +msgstr "`*ModelForCausalLM`(顶层包装器)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:26 +msgid "`*Model` (main architecture)" +msgstr "`*Model`(主架构)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:27 +msgid "`*DecoderLayer` (transformer block)" +msgstr "`*DecoderLayer`(transformer 块)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:28 +msgid "`*Attention` and `*MLP` (specific computation unit)" +msgstr "`*Attention` 和 `*MLP`(特定计算单元)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:31 +msgid "`*` denotes your model's unique identifier." +msgstr "`*` 表示你的模型的唯一标识符。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:34 +msgid "Critical Implementation Details:" +msgstr "关键实现细节:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:36 +msgid "All modules must include a `prefix` argument in `__init__()`."
+msgstr "所有模块在 `__init__()` 方法中都必须包含一个 `prefix` 参数。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:38 +msgid "**Required interfaces:**" +msgstr "**必需的接口:**" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "Module Type" +msgstr "模块类型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "Required Methods" +msgstr "必需的方法" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`*ModelForCausalLM`" +msgstr "`*ModelForCausalLM`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`get_input_embeddings`, `compute_logits`, `load_weights`" +msgstr "`get_input_embeddings`,`compute_logits`,`load_weights`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`*Model`" +msgstr "`*Model`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`get_input_embeddings`, `load_weights`" +msgstr "`get_input_embeddings`,`load_weights`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:45 +msgid "Attention Backend Integration:" +msgstr "注意力后端集成:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:47 +msgid "" +"Importing attention via `from vllm.attention import Attention` can " +"automatically leverage the attention backend routing of vllm-kunlun (see: " +"`get_attn_backend_cls()` in `vllm_kunlun/platform.py`)." +msgstr "" +"通过 `from vllm.attention import Attention` 导入 attention 可以自动利用 vllm-kunlun " +"的注意力后端路由(详见:`vllm_kunlun/platform.py` 中的 `get_attn_backend_cls()`)。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:49 +msgid "Tensor Parallelism:" +msgstr "张量并行:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:51 +msgid "" +"Use vllm's parallel layers (`ColumnParallelLinear`, " +"`VocabParallelEmbedding`, etc.) to implement models supporting tensor " +"parallelism. Note that Kunlun-specific customizations are implemented in " +"`vllm_kunlun/ops/` directory (RMSNorm, VocabParallelEmbedding, etc.)."
+msgstr "" +"使用 vllm 的并行层(如 `ColumnParallelLinear`、`VocabParallelEmbedding` " +"等)来实现支持张量并行的模型。需要注意的是,Kunlun 特有的自定义实现(如 RMSNorm、VocabParallelEmbedding 等)位于 " +"`vllm_kunlun/ops/` 目录下。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:53 +msgid "" +"**Reference Implementation Template** (assumed path: " +"`vllm_kunlun/models/custom_model.py`):" +msgstr "**参考实现模板**(假定路径:`vllm_kunlun/models/custom_model.py`):" + +#: ../../developer_guide/modeling/adding_a_new_model.md:135 +msgid "Method 2: Customizing Existing vLLM Models" +msgstr "方法二:自定义已有的 vLLM 模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:137 +msgid "" +"For most use cases, extending existing implementations is preferable. We " +"demonstrate an example to inherit from base classes and implement a custom " +"deepseek model below (assumed path: `vllm_kunlun/models/deepseek_v2.py`)." +msgstr "" +"对于大多数使用场景,建议扩展已有的实现。我们在下面演示了一个示例,通过继承基类并实现一个自定义的 deepseek " +"模型(假定路径:`vllm_kunlun/models/deepseek_v2.py`)。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:175 +msgid "" +"For a complete implementation reference, see: " +"`vllm_kunlun/models/deepseek_v2.py`." +msgstr "完整的实现参考请见:`vllm_kunlun/models/deepseek_v2.py`。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:178 +msgid "Step 2: Registering Custom Models using ModelRegistry Plugins in vLLM" +msgstr "步骤 2:使用 vLLM 中的 ModelRegistry 插件注册自定义模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:180 +msgid "" +"vllm provides a plugin mechanism for registering externally implemented " +"models without modifying its codebase."
+msgstr "vllm 提供了一种插件机制,可用于注册外部实现的模型,而无需修改其代码库。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:182 +msgid "" +"To integrate your implemented model from `vllm_kunlun/models/` directory:" +msgstr "要集成你在 `vllm_kunlun/models/` 目录下实现的模型:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:184 +msgid "" +"Import your model implementation in `vllm_kunlun/models/__init__.py` using " +"relative imports." +msgstr "使用相对导入在 `vllm_kunlun/models/__init__.py` 中导入你的模型实现。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:185 +msgid "" +"Register the model wrapper class via `vllm.ModelRegistry.register_model()` " +"function." +msgstr "通过 `vllm.ModelRegistry.register_model()` 函数注册模型包装类。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:187 +msgid "" +"**Reference Registration Template** (an example of registering new models in" +" `vllm_kunlun/models/__init__.py`):" +msgstr "**参考注册模板**(在 `vllm_kunlun/models/__init__.py` 注册新模型的示例):" + +#: ../../developer_guide/modeling/adding_a_new_model.md:210 +msgid "" +"The first argument of `vllm.ModelRegistry.register_model()` indicates the " +"unique architecture identifier which must match `architectures` in " +"`config.json` of the model." 
+msgstr "" +"`vllm.ModelRegistry.register_model()` 的第一个参数表示唯一的架构标识符,这个标识符必须与模型的 " +"`config.json` 文件中的 `architectures` 匹配。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:221 +msgid "Step 3: Verification" +msgstr "第 3 步:验证" + +#: ../../developer_guide/modeling/adding_a_new_model.md:223 +msgid "Case 1: Overriding Existing vLLM Model Architecture" +msgstr "案例 1:重载已有的 vLLM 模型架构" + +#: ../../developer_guide/modeling/adding_a_new_model.md:225 +msgid "" +"If you're registering a customized model architecture based on vllm's " +"existing implementation (overriding vllm's original class), when executing " +"vllm offline/online inference (using any model), you'll observe warning logs" +" similar to the following output from " +"`vllm/models_executor/models/registry.py`." +msgstr "" +"如果你基于 vllm 的现有实现注册了一个自定义的模型架构(覆盖了 vllm 的原始类),在执行 vllm " +"的离线/在线推理(无论使用哪个模型)时,你会看到类似于 `vllm/models_executor/models/registry.py` " +"输出的警告日志。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:231 +msgid "Case 2: Registering New Model Architecture" +msgstr "案例2:注册新模型架构" + +#: ../../developer_guide/modeling/adding_a_new_model.md:233 +msgid "" +"If you're registering a novel model architecture not present in vllm " +"(creating a completely new class), current logs won't provide explicit " +"confirmation by default. It's recommended to add the following logging " +"statement at the end of the `register_model` method in " +"`vllm/models_executor/models/registry.py`." +msgstr "" +"如果你注册了 vllm 中不存在的新模型架构(创建一个全新的类),当前日志默认不会提供明确的确认信息。建议在 " +"`vllm/models_executor/models/registry.py` 文件中的 `register_model` " +"方法末尾添加如下日志语句。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:239 +msgid "" +"After adding this line, you will see confirmation logs shown below when " +"running vllm offline/online inference (using any model)." 
+msgstr "添加这一行之后,当你运行 vllm 离线/在线推理(使用任何模型)时,将会看到如下确认日志。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:245 +msgid "" +"This log output confirms your novel model architecture has been successfully" +" registered in vllm." +msgstr "该日志输出确认了你的新模型架构已成功在 vllm 中注册。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:247 +msgid "Step 4: Testing" +msgstr "第4步:测试" + +#: ../../developer_guide/modeling/adding_a_new_model.md:249 +msgid "" +"After adding a new model, we should do basic functional test (offline/online" +" inference), accuracy test and performance benchmark for the model." +msgstr "在添加新模型后,我们应对该模型进行基本功能测试(离线/在线推理)、准确率测试和性能基准测试。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:251 +msgid "Find more details at:" +msgstr "更多详情请见:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:253 +msgid "" +"[Accuracy test guide](https://vllm-" +"kunlun.readthedocs.io/en/latest/developer_guide/evaluation/index.html)" +msgstr "" +"[精度测试指南](https://vllm-" +"kunlun.readthedocs.io/en/latest/developer_guide/evaluation/index.html)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:254 +msgid "" +"[Performance benchmark guide](https://vllm-" +"kunlun.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)" +msgstr "" +"[性能基准指南](https://vllm-" +"kunlun.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:256 +msgid "Step 5: Updating Supported Models Doc" +msgstr "第5步:更新支持的模型文档" + +#: ../../developer_guide/modeling/adding_a_new_model.md:258 +msgid "" +"At last, if all the steps above are completed, you should add the new model " +"into our [Supported Models](https://vllm-" +"kunlun.readthedocs.io/en/latest/user_guide/supported_models.html) doc." 
+msgstr "" +"最后,如果以上所有步骤都已完成,你应该将新模型添加到我们的[支持的模型](https://vllm-" +"kunlun.readthedocs.io/en/latest/user_guide/supported_models.html)文档中。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po new file mode 100644 index 0000000..82eb924 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po @@ -0,0 +1,29 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:1 +msgid "Adding a New Multi-Modal Model" +msgstr "添加新的多模态模型" + +#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:3 +msgid "**_Comming soon ..._**" +msgstr "**_敬请期待 ..._**" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po new file mode 100644 index 0000000..d333038 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po @@ -0,0 +1,32 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/modeling/index.md:1 +#: ../../developer_guide/modeling/index.md:5 +msgid "Modeling" +msgstr "新模型" + +#: ../../developer_guide/modeling/index.md:3 +msgid "" +"This section provides tutorials of how to implement and register a new model" +" into vllm-kunlun." +msgstr "本节提供了如何在 vllm-kunlun 中实现并注册新模型的教程。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po new file mode 100644 index 0000000..827dad7 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/performance/index.md:1 +#: ../../developer_guide/performance/index.md:3 +msgid "Performance" +msgstr "性能" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/optimization_and_tuning.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/optimization_and_tuning.po new file mode 100644 index 0000000..9cc86d2 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/optimization_and_tuning.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/performance/optimization_and_tuning.md:1 +msgid "Optimization and Tuning" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po new file mode 100644 index 0000000..3cb144f --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po @@ -0,0 +1,92 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/performance/performance_benchmark.md:1 +msgid "Performance Benchmark" +msgstr "性能基准" + +#~ msgid "" +#~ "This document details the benchmark " +#~ "methodology for vllm-kunlun, aimed at" +#~ " evaluating the performance under a " +#~ "variety of workloads. 
To maintain " +#~ "alignment with vLLM, we use the " +#~ "[benchmark](https://github.com/vllm-" +#~ "project/vllm/tree/main/benchmarks) script provided " +#~ "by the vllm project." +#~ msgstr "" +#~ "本文档详细说明了 vllm-kunlun 的基准测试方法,旨在评估其在多种工作负载下的性能。为了与" +#~ " vLLM 保持一致,我们使用 vllm 项目提供的 " +#~ "[benchmark](https://github.com/vllm-" +#~ "project/vllm/tree/main/benchmarks) 脚本。" + +#~ msgid "" +#~ "**Benchmark Coverage**: We measure offline " +#~ "e2e latency and throughput, and " +#~ "fixed-QPS online serving benchmarks, for" +#~ " more details see [vllm-kunlun " +#~ "benchmark scripts](https://github.com/vllm-project" +#~ "/vllm-kunlun/tree/main/benchmarks)." +#~ msgstr "" +#~ "**基准测试覆盖范围**:我们测量离线端到端延迟和吞吐量,以及固定 QPS 的在线服务基准测试。更多详情请参见" +#~ " [vllm-kunlun 基准测试脚本](https://github.com/vllm-" +#~ "project/vllm-kunlun/tree/main/benchmarks)。" + +#~ msgid "1. Run docker container" +#~ msgstr "1. 运行 docker 容器" + +#~ msgid "2. Install dependencies" +#~ msgstr "2. 安装依赖项" + +#~ msgid "3. (Optional)Prepare model weights" +#~ msgstr "3.(可选)准备模型权重" + +#~ msgid "" +#~ "For faster running speed, we recommend" +#~ " downloading the model in advance:" +#~ msgstr "为了更快的运行速度,建议提前下载模型:" + +#~ msgid "" +#~ "You can also replace all model " +#~ "paths in the [json](https://github.com/vllm-" +#~ "project/vllm-kunlun/tree/main/benchmarks/tests) files " +#~ "with your local paths:" +#~ msgstr "" +#~ "你也可以将 [json](https://github.com/vllm-project/vllm-" +#~ "kunlun/tree/main/benchmarks/tests) 文件中的所有模型路径替换为你的本地路径:" + +#~ msgid "4. Run benchmark script" +#~ msgstr "4. 运行基准测试脚本" + +#~ msgid "Run benchmark script:" +#~ msgstr "运行基准测试脚本:" + +#~ msgid "After about 10 mins, the output is as shown below:" +#~ msgstr "大约 10 分钟后,输出如下所示:" + +#~ msgid "" +#~ "The result json files are generated " +#~ "into the path `benchmark/results` These " +#~ "files contain detailed benchmarking results" +#~ " for further analysis." 
+#~ msgstr "结果 json 文件会生成到路径 `benchmark/results`。这些文件包含了用于进一步分析的详细基准测试结果。" + +diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po new file mode 100644 index 0000000..4f1837b --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po @@ -0,0 +1,86 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/developer_guide/performance/profile_execute_duration.md:1 +msgid "Profile Execute Duration" +msgstr "执行时长分析" + +#~ msgid "" +#~ "The execution duration of each stage " +#~ "(including pre/post-processing, model forward," +#~ " etc.) usually needs to be captured" +#~ " during a complete inference process. " +#~ "Typically, this is done by using " +#~ "`torch.xpu.synchronize()` and obtaining CPU " +#~ "timestamps, which increases the performance" +#~ " overhead of host/device synchronization."
+#~ msgstr "" +#~ "在完整的推理过程中,通常需要记录每个阶段(包括前/后处理、模型前向等)的执行时长。一般通过使用 " +#~ "`torch.xpu.synchronize()` 并获取 CPU " +#~ "时间戳来实现,这会增加主机/设备同步的性能开销。" + +#~ msgid "" +#~ "**To reduce the performance overhead, we" +#~ " add this feature, using the XPU " +#~ "event timestamp mechanism to observe the" +#~ " device execution time asynchronously.**" +#~ msgstr "**为了减少性能开销,我们添加了此功能,使用 XPU 事件时间戳机制异步观测设备的执行时间。**" + +#~ msgid "Usage" +#~ msgstr "用法" + +#~ msgid "" +#~ "Use the environment variable " +#~ "`VLLM_KUNLUN_MODEL_EXECUTE_TIME_OBSERVE` to enable " +#~ "this feature." +#~ msgstr "使用环境变量 `VLLM_KUNLUN_MODEL_EXECUTE_TIME_OBSERVE` 来启用此功能。" + +#~ msgid "" +#~ "Use the non-blocking API " +#~ "`ProfileExecuteDuration().capture_async` to set " +#~ "observation points asynchronously when you " +#~ "need to observe the execution duration." +#~ msgstr "" +#~ "当你需要观察执行时长时,可以使用非阻塞 API " +#~ "`ProfileExecuteDuration().capture_async` 异步设置观察点。" + +#~ msgid "" +#~ "Use the blocking API " +#~ "`ProfileExecuteDuration().pop_captured_sync` at an " +#~ "appropriate time to get and print " +#~ "the execution durations of all observed" +#~ " stages." +#~ msgstr "" +#~ "在适当的时机使用阻塞式 API " +#~ "`ProfileExecuteDuration().pop_captured_sync` 获取并打印所有已观察到阶段的执行时长。" + +#~ msgid "" +#~ "**We have instrumented the key inference" +#~ " stages (including pre-processing, model" +#~ " forward pass, etc.) for execute " +#~ "duration profiling. Execute the script " +#~ "as follows:**" +#~ msgstr "**我们已经对关键的推理阶段(包括预处理、模型前向传递等)进行了执行时长分析的检测。请按如下方式执行脚本:**" + +#~ msgid "Example Output" +#~ msgstr "示例输出" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po b/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po new file mode 100644 index 0000000..16a66a5 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po @@ -0,0 +1,507 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/faqs.md:1 +msgid "FAQs" +msgstr "" + +#: ../../source/faqs.md:3 +msgid "Version Specific FAQs" +msgstr "特定版本常见问题" + +#~ msgid "" +#~ "[[v0.7.3.post1] FAQ & Feedback](https://github.com" +#~ "/vllm-project/vllm-kunlun/issues/1007)" +#~ msgstr "" +#~ "[[v0.7.3.post1] 常见问题与反馈](https://github.com/vllm-project" +#~ "/vllm-kunlun/issues/1007)" + +#~ msgid "" +#~ "[[v0.9.2rc1] FAQ & Feedback](https://github.com" +#~ "/vllm-project/vllm-kunlun/issues/1742)" +#~ msgstr "" +#~ "[[v0.9.2rc1] 常见问题与反馈](https://github.com/vllm-project" +#~ "/vllm-kunlun/issues/1742)" + +#~ msgid "General FAQs" +#~ msgstr "常见问题解答" + +#~ msgid "1. What devices are currently supported?" +#~ msgstr "1. 目前支持哪些设备?" 
+ +#~ msgid "" +#~ "Currently, **ONLY** Atlas A2 series(Kunlun-" +#~ "cann-kernels-910b) and Atlas 300I" +#~ "(Kunlun-cann-kernels-310p) series are " +#~ "supported:" +#~ msgstr "" +#~ "目前,**仅**支持 Atlas A2 系列(Kunlun-cann-" +#~ "kernels-910b)和 Atlas 300I(Kunlun-cann-" +#~ "kernels-310p)系列:" + +#~ msgid "" +#~ "Atlas A2 Training series (Atlas 800T " +#~ "A2, Atlas 900 A2 PoD, Atlas 200T" +#~ " A2 Box16, Atlas 300T A2)" +#~ msgstr "" +#~ "Atlas A2 训练系列(Atlas 800T A2,Atlas 900" +#~ " A2 PoD,Atlas 200T A2 Box16,Atlas " +#~ "300T A2)" + +#~ msgid "Atlas 800I A2 Inference series (Atlas 800I A2)" +#~ msgstr "Atlas 800I A2 推理系列(Atlas 800I A2)" + +#~ msgid "Atlas 300I Inference series (Atlas 300I Duo)" +#~ msgstr "Atlas 300I 推理系列(Atlas 300I Duo)" + +#~ msgid "Below series are NOT supported yet:" +#~ msgstr "以下系列目前尚不受支持:" + +#~ msgid "Atlas 200I A2 (Kunlun-cann-kernels-310b) unplanned yet" +#~ msgstr "Atlas 200I A2(Kunlun-cann-kernels-310b)尚未计划" + +#~ msgid "Kunlun 910, Kunlun 910 Pro B (Kunlun-cann-kernels-910) unplanned yet" +#~ msgstr "Kunlun 910,Kunlun 910 Pro B(Kunlun-cann-kernels-910)尚未计划" + +#~ msgid "" +#~ "From a technical view, vllm-kunlun " +#~ "support would be possible if the " +#~ "torch-xpu is supported. Otherwise, we " +#~ "have to implement it by using " +#~ "custom ops. We are also welcome to" +#~ " join us to improve together." +#~ msgstr "" +#~ "从技术角度来看,如果支持 torch-xpu,则可以支持 vllm-" +#~ "kunlun。否则,我们需要通过自定义算子来实现。我们也欢迎大家一起加入,共同改进。" + +#~ msgid "2. How to get our docker containers?" +#~ msgstr "2. 如何获取我们的 docker 容器?" + +#~ msgid "" +#~ "You can get our containers at " +#~ "`Quay.io`, e.g., [vllm-" +#~ "kunlun](https://quay.io/repository/kunlun/vllm-" +#~ "kunlun?tab=tags) and " +#~ "[cann](https://quay.io/repository/kunlun/cann?tab=tags)." 
+#~ msgstr "" +#~ "你可以在 `Quay.io` 获取我们的容器,例如,[vllm-" +#~ "kunlun](https://quay.io/repository/kunlun/vllm-" +#~ "kunlun?tab=tags) 和 " +#~ "[cann](https://quay.io/repository/kunlun/cann?tab=tags)。" + +#~ msgid "" +#~ "If you are in China, you can " +#~ "use `daocloud` to accelerate your " +#~ "downloading:" +#~ msgstr "如果你在中国,可以使用 `daocloud` 来加速下载:" + +#~ msgid "3. What models does vllm-kunlun supports?" +#~ msgstr "3. vllm-kunlun 支持哪些模型?" + +#~ msgid "" +#~ "Find more details [here](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html)." +#~ msgstr "" +#~ "在[此处](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html)查看更多详细信息。" + +#~ msgid "4. How to get in touch with our community?" +#~ msgstr "4. 如何与我们的社区取得联系?" + +#~ msgid "" +#~ "There are many channels that you " +#~ "can communicate with our community " +#~ "developers / users:" +#~ msgstr "你可以通过多种渠道与我们的社区开发者/用户进行交流:" + +#~ msgid "" +#~ "Submit a GitHub [issue](https://github.com" +#~ "/vllm-project/vllm-kunlun/issues?page=1)." +#~ msgstr "" +#~ "提交一个 GitHub [issue](https://github.com/vllm-" +#~ "project/vllm-kunlun/issues?page=1)。" + +#~ msgid "" +#~ "Join our [weekly " +#~ "meeting](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z)" +#~ " and share your ideas." +#~ msgstr "加入我们的[每周会议](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z),并分享你的想法。" + +#~ msgid "" +#~ "Join our [WeChat](https://github.com/vllm-" +#~ "project/vllm-kunlun/issues/227) group and ask" +#~ " your quenstions." +#~ msgstr "" +#~ "加入我们的 [微信群](https://github.com/vllm-project" +#~ "/vllm-kunlun/issues/227) 并提问你的问题。" + +#~ msgid "" +#~ "Join our kunlun channel in [vLLM " +#~ "forums](https://discuss.vllm.ai/c/hardware-support/vllm-" +#~ "kunlun-support/6) and publish your " +#~ "topics." 
+#~ msgstr "" +#~ "加入我们在 [vLLM 论坛](https://discuss.vllm.ai/c" +#~ "/hardware-support/vllm-kunlun-support/6) 的 " +#~ "kunlun 频道并发布你的话题。" + +#~ msgid "5. What features does vllm-kunlun V1 supports?" +#~ msgstr "5. vllm-kunlun V1 支持哪些功能?" + +#~ msgid "" +#~ "Find more details [here](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)." +#~ msgstr "" +#~ "在[这里](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)找到更多详细信息。" + +#~ msgid "" +#~ "6. How to solve the problem of " +#~ "\"Failed to infer device type\" or " +#~ "\"libatb.so: cannot open shared object " +#~ "file\"?" +#~ msgstr "6. 如何解决“无法推断设备类型”或“libatb.so:无法打开共享对象文件”问题?" + +#~ msgid "" +#~ "Basically, the reason is that the " +#~ "XPU environment is not configured " +#~ "correctly. You can:" +#~ msgstr "基本上,原因是 XPU 环境没有正确配置。你可以:" + +#~ msgid "" +#~ "try `source /usr/local/Kunlun/nnal/atb/set_env.sh` " +#~ "to enable NNAL package." +#~ msgstr "尝试运行 `source /usr/local/Kunlun/nnal/atb/set_env.sh` 以启用 NNAL 包。" + +#~ msgid "" +#~ "try `source /usr/local/Kunlun/kunlun-" +#~ "toolkit/set_env.sh` to enable CANN package." +#~ msgstr "尝试运行 `source /usr/local/Kunlun/kunlun-toolkit/set_env.sh` 以启用 CANN 包。" + +#~ msgid "try `xpu-smi info` to check whether the XPU is working." +#~ msgstr "尝试运行 `xpu-smi info` 来检查 XPU 是否正常工作。" + +#~ msgid "" +#~ "If all above steps are not " +#~ "working, you can try the following " +#~ "code with python to check whether " +#~ "there is any error:" +#~ msgstr "如果以上所有步骤都无效,你可以尝试使用以下 python 代码来检查是否有错误:" + +#~ msgid "If all above steps are not working, feel free to submit a GitHub issue." +#~ msgstr "如果以上所有步骤都无法解决问题,欢迎提交一个 GitHub issue。" + +#~ msgid "7. How does vllm-kunlun perform?" +#~ msgstr "7. vllm-kunlun 的性能如何?" + +#~ msgid "" +#~ "Currently, only some models are " +#~ "improved. Such as `Qwen2.5 VL`, `Qwen3`," +#~ " `Deepseek V3`. Others are not good" +#~ " enough. 
From 0.9.0rc2, Qwen and " +#~ "Deepseek works with graph mode to " +#~ "play a good performance. What's more," +#~ " you can install `mindie-turbo` with" +#~ " `vllm-kunlun v0.7.3` to speed up " +#~ "the inference as well." +#~ msgstr "" +#~ "目前,只有部分模型得到了改进,比如 `Qwen2.5 VL`、`Qwen3` 和 " +#~ "`Deepseek V3`。其他模型的效果还不够理想。从 0.9.0rc2 开始,Qwen " +#~ "和 Deepseek 已经支持图模式,以获得更好的性能。此外,你还可以在 `vllm-" +#~ "kunlun v0.7.3` 上安装 `mindie-turbo`,进一步加速推理。" + +#~ msgid "8. How vllm-kunlun work with vllm?" +#~ msgstr "8. vllm-kunlun 如何与 vllm 协同工作?" + +#~ msgid "" +#~ "vllm-kunlun is a plugin for vllm." +#~ " Basically, the version of vllm-" +#~ "kunlun is the same as the version" +#~ " of vllm. For example, if you " +#~ "use vllm 0.7.3, you should use " +#~ "vllm-kunlun 0.7.3 as well. For main" +#~ " branch, we will make sure `vllm-" +#~ "kunlun` and `vllm` are compatible by " +#~ "each commit." +#~ msgstr "" +#~ "vllm-kunlun 是 vllm 的一个插件。基本上,vllm-kunlun" +#~ " 的版本与 vllm 的版本是相同的。例如,如果你使用 vllm " +#~ "0.7.3,你也应该使用 vllm-kunlun 0.7.3。对于主分支,我们会确保每次提交都让 " +#~ "`vllm-kunlun` 和 `vllm` 保持兼容。" + +#~ msgid "9. Does vllm-kunlun support Prefill Disaggregation feature?" +#~ msgstr "9. vllm-kunlun 支持 Prefill Disaggregation 功能吗?" + +#~ msgid "" +#~ "Currently, only 1P1D is supported on " +#~ "V0 Engine. For V1 Engine or NPND" +#~ " support, We will make it stable " +#~ "and supported by vllm-kunlun in " +#~ "the future." +#~ msgstr "目前,V0引擎只支持1P1D。对于V1引擎或NPND的支持,我们将在未来使其稳定并由vllm-kunlun支持。" + +#~ msgid "10. Does vllm-kunlun support quantization method?" +#~ msgstr "10. vllm-kunlun 支持量化方法吗?" + +#~ msgid "" +#~ "Currently, w8a8 quantization is already " +#~ "supported by vllm-kunlun originally on" +#~ " v0.8.4rc2 or higher, If you're using" +#~ " vllm 0.7.3 version, w8a8 quantization " +#~ "is supporeted with the integration of" +#~ " vllm-kunlun and mindie-turbo, please" +#~ " use `pip install vllm-kunlun[mindie-" +#~ "turbo]`." 
+#~ msgstr "" +#~ "目前,w8a8 量化已在 v0.8.4rc2 或更高版本的 vllm-" +#~ "kunlun 中原生支持。如果你使用的是 vllm 0.7.3 版本,集成了 " +#~ "vllm-kunlun 和 mindie-turbo 后也支持 w8a8" +#~ " 量化,请使用 `pip install vllm-kunlun[mindie-" +#~ "turbo]`。" + +#~ msgid "11. How to run w8a8 DeepSeek model?" +#~ msgstr "11. 如何运行 w8a8 DeepSeek 模型?" + +#~ msgid "" +#~ "Please following the [inferencing " +#~ "tutorail](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/tutorials/multi_node.html) and" +#~ " replace model to DeepSeek." +#~ msgstr "" +#~ "请按照[inferencing 教程](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/tutorials/multi_node.html)进行操作,并将模型更换为" +#~ " DeepSeek。" + +#~ msgid "" +#~ "12. There is no output in log " +#~ "when loading models using vllm-kunlun," +#~ " How to solve it?" +#~ msgstr "12. 使用 vllm-kunlun 加载模型时日志没有输出,如何解决?" + +#~ msgid "" +#~ "If you're using vllm 0.7.3 version, " +#~ "this is a known progress bar " +#~ "display issue in VLLM, which has " +#~ "been resolved in [this PR](https://github.com" +#~ "/vllm-project/vllm/pull/12428), please cherry-" +#~ "pick it locally by yourself. Otherwise," +#~ " please fill up an issue." +#~ msgstr "" +#~ "如果你正在使用 vllm 0.7.3 版本,这是 VLLM " +#~ "已知的进度条显示问题,已在 [此 PR](https://github.com/vllm-" +#~ "project/vllm/pull/12428) 中解决,请自行在本地进行 cherry-" +#~ "pick。否则,请提交一个 issue。" + +#~ msgid "13. How vllm-kunlun is tested" +#~ msgstr "13. 如何测试 vllm-kunlun" + +#~ msgid "" +#~ "vllm-kunlun is tested by functional " +#~ "test, performance test and accuracy " +#~ "test." 
+#~ msgstr "vllm-kunlun 经过功能测试、性能测试和精度测试。" + +#~ msgid "" +#~ "**Functional test**: we added CI, " +#~ "includes portion of vllm's native unit" +#~ " tests and vllm-kunlun's own unit " +#~ "tests,on vllm-kunlun's test, we test " +#~ "basic functionality、popular models availability " +#~ "and [supported features](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)" +#~ " via e2e test" +#~ msgstr "" +#~ "**功能测试**:我们添加了CI,包含了vllm原生单元测试的一部分以及vllm-kunlun自己的单元测试。在vllm-" +#~ "kunlun的测试中,我们通过e2e测试验证了基本功能、主流模型可用性和[支持的特性](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)。" + +#~ msgid "" +#~ "**Performance test**: we provide " +#~ "[benchmark](https://github.com/vllm-project/vllm-" +#~ "kunlun/tree/main/benchmarks) tools for end-" +#~ "to-end performance benchmark which can " +#~ "easily to re-route locally, we'll " +#~ "publish a perf website to show the" +#~ " performance test results for each " +#~ "pull request" +#~ msgstr "" +#~ "**性能测试**:我们提供了用于端到端性能基准测试的[基准测试](https://github.com/vllm-project" +#~ "/vllm-" +#~ "kunlun/tree/main/benchmarks)工具,可以方便地在本地重新运行。我们将发布一个性能网站,用于展示每个拉取请求的性能测试结果。" + +#~ msgid "**Accuracy test**: we're working on adding accuracy test to CI as well." +#~ msgstr "**准确性测试**:我们也在努力将准确性测试添加到CI中。" + +#~ msgid "" +#~ "Finnall, for each release, we'll publish" +#~ " the performance test and accuracy " +#~ "test report in the future." +#~ msgstr "最后,未来每个版本发布时,我们都会公开性能测试和准确性测试报告。" + +#~ msgid "14. How to fix the error \"InvalidVersion\" when using vllm-kunlun?" +#~ msgstr "14. 使用 vllm-kunlun 时如何解决 “InvalidVersion” 错误?" + +#~ msgid "" +#~ "It's usually because you have installed" +#~ " an dev/editable version of vLLM " +#~ "package. In this case, we provide " +#~ "the env variable `VLLM_VERSION` to let" +#~ " users specify the version of vLLM" +#~ " package to use. 
Please set the " +#~ "env variable `VLLM_VERSION` to the " +#~ "version of vLLM package you have " +#~ "installed. The format of `VLLM_VERSION` " +#~ "should be `X.Y.Z`." +#~ msgstr "" +#~ "这通常是因为你安装了开发版或可编辑版本的 vLLM 包。在这种情况下,我们提供了环境变量 " +#~ "`VLLM_VERSION`,以便用户指定要使用的 vLLM 包版本。请将环境变量 " +#~ "`VLLM_VERSION` 设置为你已安装的 vLLM 包的版本。`VLLM_VERSION` " +#~ "的格式应为 `X.Y.Z`。" + +#~ msgid "15. How to handle Out Of Memory?" +#~ msgstr "15. 如何处理内存溢出?" + +#~ msgid "" +#~ "OOM errors typically occur when the " +#~ "model exceeds the memory capacity of " +#~ "a single XPU. For general guidance, " +#~ "you can refer to [vLLM's OOM " +#~ "troubleshooting " +#~ "documentation](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html" +#~ "#out-of-memory)." +#~ msgstr "" +#~ "当模型超出单个 XPU 的内存容量时,通常会发生 OOM(内存溢出)错误。一般性的指导可以参考 " +#~ "[vLLM 的 OOM " +#~ "故障排除文档](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html" +#~ "#out-of-memory)。" + +#~ msgid "" +#~ "In scenarios where XPUs have limited " +#~ "HBM (High Bandwidth Memory) capacity, " +#~ "dynamic memory allocation/deallocation during " +#~ "inference can exacerbate memory fragmentation," +#~ " leading to OOM. To address this:" +#~ msgstr "" +#~ "在 XPU 的 " +#~ "HBM(高带宽内存)容量有限的场景下,推理过程中动态内存分配和释放会加剧内存碎片,从而导致 " +#~ "OOM(内存溢出)。为了解决这个问题:" + +#~ msgid "" +#~ "**Adjust `--gpu-memory-utilization`**: If " +#~ "unspecified, will use the default value" +#~ " of `0.9`. You can decrease this " +#~ "param to reserve more memory to " +#~ "reduce fragmentation risks. See more " +#~ "note in: [vLLM - Inference and " +#~ "Serving - Engine " +#~ "Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine" +#~ ".arg_utils-_engine_args_parser-cacheconfig)." 
+#~ msgstr "" +#~ "**调整 `--gpu-memory-utilization`**:如果未指定,将使用默认值 " +#~ "`0.9`。你可以降低此参数来预留更多内存,从而降低内存碎片风险。参见更多说明:[vLLM - 推理与服务 " +#~ "- " +#~ "引擎参数](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine" +#~ ".arg_utils-_engine_args_parser-cacheconfig)。" + +#~ msgid "" +#~ "**Configure `PYTORCH_XPU_ALLOC_CONF`**: Set this " +#~ "environment variable to optimize XPU " +#~ "memory management. For example, you can" +#~ " `export PYTORCH_XPU_ALLOC_CONF=expandable_segments:True` " +#~ "to enable virtual memory feature to " +#~ "mitigate memory fragmentation caused by " +#~ "frequent dynamic memory size adjustments " +#~ "during runtime, see more note in: " +#~ "[PYTORCH_XPU_ALLOC_CONF](https://www.hikunlun.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html)." +#~ msgstr "" +#~ "**配置 `PYTORCH_XPU_ALLOC_CONF`**:设置此环境变量以优化XPU内存管理。例如,你可以通过 " +#~ "`export PYTORCH_XPU_ALLOC_CONF=expandable_segments:True` " +#~ "来启用虚拟内存功能,以缓解运行时频繁动态调整内存大小导致的内存碎片问题,更多说明参见:[PYTORCH_XPU_ALLOC_CONF](https://www.hikunlun.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html)。" + +#~ msgid "16. Failed to enable XPU graph mode when running DeepSeek?" +#~ msgstr "16. 运行 DeepSeek 时无法启用 XPU 图模式?" + +#~ msgid "" +#~ "You may encounter the following error" +#~ " if running DeepSeek with XPU graph" +#~ " mode enabled. The allowed number of" +#~ " queries per kv when enabling both" +#~ " MLA and Graph mode only support " +#~ "{32, 64, 128}, **Thus this is not" +#~ " supported for DeepSeek-V2-Lite**, as it" +#~ " only has 16 attention heads. The " +#~ "XPU graph mode support on " +#~ "DeepSeek-V2-Lite will be done in the " +#~ "future." 
+#~ msgstr "" +#~ "如果在启用XPU图模式(Graph " +#~ "mode)运行DeepSeek时,您可能会遇到以下错误。当同时启用MLA和图模式时,每个kv允许的查询数只支持{32, 64," +#~ " " +#~ "128},**因此这不支持DeepSeek-V2-Lite**,因为它只有16个注意力头。未来会增加对DeepSeek-V2-Lite在XPU图模式下的支持。" + +#~ msgid "" +#~ "And if you're using DeepSeek-V3 or " +#~ "DeepSeek-R1, please make sure after the" +#~ " tensor parallel split, num_heads / " +#~ "num_kv_heads in {32, 64, 128}." +#~ msgstr "" +#~ "如果你正在使用 DeepSeek-V3 或 " +#~ "DeepSeek-R1,请确保在张量并行切分后,num_heads / num_kv_heads 的值为" +#~ " {32, 64, 128} 中的一个。" + +#~ msgid "" +#~ "17. Failed to reinstall vllm-kunlun " +#~ "from source after uninstalling vllm-" +#~ "kunlun?" +#~ msgstr "17. 卸载 vllm-kunlun 后无法从源码重新安装 vllm-kunlun?" + +#~ msgid "" +#~ "You may encounter the problem of C" +#~ " compilation failure when reinstalling " +#~ "vllm-kunlun from source using pip. If" +#~ " the installation fails, it is " +#~ "recommended to use `python setup.py " +#~ "install` to install, or use `python " +#~ "setup.py clean` to clear the cache." +#~ msgstr "" +#~ "当你使用 pip 从源码重新安装 vllm-kunlun 时,可能会遇到 " +#~ "C 编译失败的问题。如果安装失败,建议使用 `python setup.py " +#~ "install` 进行安装,或者使用 `python setup.py clean` " +#~ "清除缓存。" + +#~ msgid "18. How to generate determinitic results when using vllm-kunlun?" +#~ msgstr "18. 使用 vllm-kunlun 时如何生成确定性结果?" + +#~ msgid "There are several factors that affect output certainty:" +#~ msgstr "有几个因素会影响输出的确定性:" + +#~ msgid "" +#~ "Sampler Method: using **Greedy sample** " +#~ "by setting `temperature=0` in " +#~ "`SamplingParams`, e.g.:" +#~ msgstr "" +#~ "采样方法:通过在 `SamplingParams` 中设置 `temperature=0` " +#~ "来使用 **贪婪采样(Greedy sample)**,例如:" + +#~ msgid "Set the following enveriments parameters:" +#~ msgstr "设置以下环境参数:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/index.po new file mode 100644 index 0000000..39e5d18 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/index.po @@ -0,0 +1,78 @@ +# SOME DESCRIPTIVE TITLE. 
+# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 17:48+0800\n" +"PO-Revision-Date: 2025-07-18 10:05+0800\n" +"Last-Translator: \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/index.md:34 +msgid "Getting Started" +msgstr "快速开始" + +#: ../../source/index.md:44 +msgid "User Guide" +msgstr "用户指南" + +#: ../../source/index.md:54 +msgid "Developer Guide" +msgstr "开发者指南" + +#: ../../source/index.md:64 +msgid "Community" +msgstr "社区" + +#: ../../source/index.md:1 +msgid "Welcome to vLLM Kunlun Plugin" +msgstr "欢迎使用 vLLM Kunlun 插件" + +#: ../../source/index.md:3 +msgid "vLLM" +msgstr "vLLM" + +#: ../../source/index.md:25 +msgid "" +"vLLM Kunlun (vllm-kunlun) is a community-maintained hardware plugin " +"designed to seamlessly run vLLM on the Kunlun XPU. It is the recommended " +"approach for integrating the Kunlun backend within the vLLM community, " +"adhering to the principles outlined in the [[RFC]: Hardware " +"pluggable](https://github.com/vllm-project/vllm/issues/11162). This " +"plugin provides a hardware-pluggable interface that decouples the " +"integration of the Kunlun XPU with vLLM." +msgstr "vLLM Kunlun(vllm-kunlun)是一个由社区维护的硬件插件,旨在无缝地在昆仑 XPU 上运行 vLLM。它是将昆仑后端集成到 vLLM 社区的推荐方法,遵循 [[RFC]:硬件可插拔](https://github.com/vllm-project/vllm/issues/11162) 中提出的原则,提供了一个硬件可插拔接口,实现了昆仑 XPU 与 vLLM 集成的解耦。" + + +#: ../../source/index.md:27 +msgid "" +"By utilizing the vLLM Kunlun plugin, popular open-source models, " +"including Transformer-like, Mixture-of-Expert, Embedding, and Multi-modal" +" LLMs, can run effortlessly on the Kunlun XPU." 
+msgstr "" +"通过使用 vLLM Kunlun 插件,流行的开源模型,包括 Transformer 类、混合专家、嵌入式、多模态大模型等,都可以在 Kunlun" +" XPU 上无缝运行。" + +#: ../../source/index.md:31 +msgid "Documentation" +msgstr "文档" + +#~ msgid "" +#~ "vLLM Kunlun plugin (vllm-kunlun) is " +#~ "a community maintained hardware plugin " +#~ "for running vLLM on the Kunlun " +#~ "XPU." +#~ msgstr "vLLM Kunlun 插件(vllm-kunlun)是一个由社区维护的硬件插件,用于在 Kunlun XPU 上运行 vLLM。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/installation.po b/docs/source/locale/zh_CN/LC_MESSAGES/installation.po new file mode 100644 index 0000000..d0a7e6b --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/installation.po @@ -0,0 +1,260 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: 2025-07-18 10:09+0800\n" +"Last-Translator: \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/installation.md:1 +msgid "Installation" +msgstr "安装" + +#~ msgid "This document describes how to install vllm-kunlun manually." +#~ msgstr "本文档介绍如何手动安装 vllm-kunlun。" + +#~ msgid "Requirements" +#~ msgstr "要求" + +#~ msgid "OS: Linux" +#~ msgstr "操作系统:Linux" + +#~ msgid "Python: >= 3.9, < 3.12" +#~ msgstr "Python:>= 3.9,< 3.12" + +#~ msgid "A hardware with Kunlun XPU. It's usually the Atlas 800 A2 series." 
+#~ msgstr "配备有昆仑XPU的硬件,通常是Atlas 800 A2系列。" + +#~ msgid "Software:" +#~ msgstr "软件:" + +#~ msgid "Software" +#~ msgstr "软件" + +#~ msgid "Supported version" +#~ msgstr "支持的版本" + +#~ msgid "Note" +#~ msgstr "注释" + +#~ msgid "CANN" +#~ msgstr "CANN" + +#~ msgid ">= 8.1.RC1" +#~ msgstr ">= 8.1.RC1" + +#~ msgid "Required for vllm-kunlun and torch-xpu" +#~ msgstr "vllm-kunlun 和 torch-xpu 必需" + +#~ msgid "torch-xpu" +#~ msgstr "torch-xpu" + +#~ msgid ">= 2.5.1.post1.dev20250619" +#~ msgstr ">= 2.5.1.post1.dev20250619" + +#~ msgid "" +#~ "Required for vllm-kunlun, No need " +#~ "to install manually, it will be " +#~ "auto installed in below steps" +#~ msgstr "vllm-kunlun 必需,无需手动安装,后续步骤会自动安装。" + +#~ msgid "torch" +#~ msgstr "torch" + +#~ msgid ">= 2.5.1" +#~ msgstr ">= 2.5.1" + +#~ msgid "Required for torch-xpu and vllm" +#~ msgstr "torch-xpu 和 vllm 所需" + +#~ msgid "You have 2 way to install:" +#~ msgstr "你有两种安装方式:" + +#~ msgid "" +#~ "**Using pip**: first prepare env " +#~ "manually or via CANN image, then " +#~ "install `vllm-kunlun` using pip." +#~ msgstr "**使用 pip**:首先手动准备环境或通过 CANN 镜像准备环境,然后使用 pip 安装 `vllm-kunlun`。" + +#~ msgid "" +#~ "**Using docker**: use the `vllm-kunlun`" +#~ " pre-built docker image directly." +#~ msgstr "**使用 docker**:直接使用 `vllm-kunlun` 预构建的 docker 镜像。" + +#~ msgid "Configure a new environment" +#~ msgstr "配置一个新环境" + +#~ msgid "" +#~ "Before installing, you need to make " +#~ "sure firmware/driver and CANN are " +#~ "installed correctly, refer to " +#~ "[link](https://kunlun.github.io/docs/sources/kunlun/quick_install.html)" +#~ " for more details."
+#~ msgstr "" +#~ "在安装之前,您需要确保固件/驱动和 CANN 已正确安装,更多详情请参考 " +#~ "[链接](https://kunlun.github.io/docs/sources/kunlun/quick_install.html)。" + +#~ msgid "Configure hardware environment" +#~ msgstr "配置硬件环境" + +#~ msgid "" +#~ "To verify that the Kunlun XPU " +#~ "firmware and driver were correctly " +#~ "installed, run:" +#~ msgstr "要验证 Kunlun XPU 固件和驱动程序是否正确安装,请运行:" + +#~ msgid "" +#~ "Refer to [Kunlun Environment Setup " +#~ "Guide](https://kunlun.github.io/docs/sources/kunlun/quick_install.html)" +#~ " for more details." +#~ msgstr "更多详情请参考[Kunlun环境搭建指南](https://kunlun.github.io/docs/sources/kunlun/quick_install.html)。" + +#~ msgid "Configure software environment" +#~ msgstr "配置软件环境" + +#~ msgid "Before using pip" +#~ msgstr "在使用 pip 之前" + +#~ msgid "" +#~ "The easiest way to prepare your " +#~ "software environment is using CANN image" +#~ " directly:" +#~ msgstr "最简单的方式是直接使用 CANN 镜像来准备您的软件环境:" + +#~ msgid "Click here to see \"Install CANN manually\"" +#~ msgstr "点击此处查看“手动安装 CANN”" + +#~ msgid "You can also install CANN manually:" +#~ msgstr "你也可以手动安装 CANN:" + +#~ msgid "Before using docker" +#~ msgstr "在使用 docker 之前" + +#~ msgid "" +#~ "No more extra step if you are " +#~ "using `vllm-kunlun` prebuilt docker " +#~ "image." +#~ msgstr "如果你使用 `vllm-kunlun` 预构建的 docker 镜像,就无需额外的步骤。" + +#~ msgid "Once it's done, you can start to set up `vllm` and `vllm-kunlun`." 
+#~ msgstr "完成后,你可以开始配置 `vllm` 和 `vllm-kunlun`。" + +#~ msgid "Setup vllm and vllm-kunlun" +#~ msgstr "安装 vllm 和 vllm-kunlun" + +#~ msgid "Using pip" +#~ msgstr "使用 pip" + +#~ msgid "First install system dependencies and config pip mirror:" +#~ msgstr "首先安装系统依赖并配置 pip 镜像:" + +#~ msgid "" +#~ "**[Optional]** Then config the extra-" +#~ "index of `pip` if you are working" +#~ " on a x86 machine or using " +#~ "torch-xpu dev version:" +#~ msgstr "**[可选]** 如果你在 x86 机器上工作或使用 torch-xpu 开发版,请配置 `pip` 的额外索引:" + +#~ msgid "Then you can install `vllm` and `vllm-kunlun` from **pre-built wheel**:" +#~ msgstr "然后你可以从**预编译的 wheel 包**安装 `vllm` 和 `vllm-kunlun`:" + +#~ msgid "Click here to see \"Build from source code\"" +#~ msgstr "点击此处查看“从源代码构建”" + +#~ msgid "or build from **source code**:" +#~ msgstr "或者从**源代码**构建:" + +#~ msgid "" +#~ "vllm-kunlun will build custom ops " +#~ "by default. If you don't want to" +#~ " build it, set `COMPILE_CUSTOM_KERNELS=0` " +#~ "environment to disable it." +#~ msgstr "" +#~ "vllm-kunlun 默认会编译自定义算子。如果你不想编译它,可以设置环境变量 " +#~ "`COMPILE_CUSTOM_KERNELS=0` 来禁用。" + +#~ msgid "" +#~ "If you are building from v0.7.3-dev " +#~ "and intend to use sleep mode " +#~ "feature, you should set " +#~ "`COMPILE_CUSTOM_KERNELS=1` manually. To build " +#~ "custom ops, gcc/g++ higher than 8 " +#~ "and c++ 17 or higher is required." +#~ " If you're using `pip install -e " +#~ ".` and encourage a torch-xpu " +#~ "version conflict, please install with " +#~ "`pip install --no-build-isolation -e " +#~ ".` to build on system env. If " +#~ "you encounter other problems during " +#~ "compiling, it is probably because " +#~ "unexpected compiler is being used, you" +#~ " may export `CXX_COMPILER` and `C_COMPILER`" +#~ " in env to specify your g++ and" +#~ " gcc locations before compiling." 
+#~ msgstr "" +#~ "如果你是从 v0.7.3-dev 版本开始构建,并且打算使用休眠模式功能,你需要手动设置 " +#~ "`COMPILE_CUSTOM_KERNELS=1`。构建自定义算子时,要求 gcc/g++ 版本高于 " +#~ "8 且支持 c++ 17 或更高标准。如果你正在使用 `pip " +#~ "install -e .` 并且出现了 torch-xpu " +#~ "版本冲突,请使用 `pip install --no-build-" +#~ "isolation -e .` " +#~ "在系统环境下进行安装。如果在编译过程中遇到其它问题,可能是因为使用了非预期的编译器,你可以在编译前通过环境变量导出 " +#~ "`CXX_COMPILER` 和 `C_COMPILER`,以指定你的 g++ 和 " +#~ "gcc 路径。" + +#~ msgid "Using docker" +#~ msgstr "使用 docker" + +#~ msgid "You can just pull the **prebuilt image** and run it with bash." +#~ msgstr "你可以直接拉取**预构建镜像**并用 bash 运行它。" + +#~ msgid "Click here to see \"Build from Dockerfile\"" +#~ msgstr "点击这里查看“从 Dockerfile 构建”" + +#~ msgid "or build IMAGE from **source code**:" +#~ msgstr "或从**源代码**构建 IMAGE:" + +#~ msgid "" +#~ "The default workdir is `/workspace`, " +#~ "vLLM and vLLM Kunlun code are " +#~ "placed in `/vllm-workspace` and " +#~ "installed in [development " +#~ "mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip" +#~ " install -e`) to help developer " +#~ "immediately take place changes without " +#~ "requiring a new installation." +#~ msgstr "" +#~ "默认的工作目录是 `/workspace`,vLLM 和 vLLM Kunlun " +#~ "代码被放置在 `/vllm-" +#~ "workspace`,并以[开发模式](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip" +#~ " install -e`)安装,以便开发者能够即时生效更改,而无需重新安装。" + +#~ msgid "Extra information" +#~ msgstr "额外信息" + +#~ msgid "Verify installation" +#~ msgstr "验证安装" + +#~ msgid "Create and run a simple inference test. The `example.py` can be like:" +#~ msgstr "创建并运行一个简单的推理测试。`example.py` 可以如下:" + +#~ msgid "Then run:" +#~ msgstr "然后运行:" + +#~ msgid "The output will be like:" +#~ msgstr "输出将会像这样:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po b/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po new file mode 100644 index 0000000..e87ffac --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po @@ -0,0 +1,139 @@ +# SOME DESCRIPTIVE TITLE. 
+# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: 2025-07-18 10:09+0800\n" +"Last-Translator: \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/quick_start.md:1 +msgid "Quickstart" +msgstr "快速入门" + +#: ../../source/quick_start.md:3 +msgid "Prerequisites" +msgstr "先决条件" + +#: ../../source/quick_start.md:5 +msgid "Supported Devices" +msgstr "支持的设备" + +#~ msgid "" +#~ "Atlas A2 Training series (Atlas 800T " +#~ "A2, Atlas 900 A2 PoD, Atlas 200T" +#~ " A2 Box16, Atlas 300T A2)" +#~ msgstr "" +#~ "Atlas A2 训练系列(Atlas 800T A2,Atlas 900" +#~ " A2 PoD,Atlas 200T A2 Box16,Atlas " +#~ "300T A2)" + +#~ msgid "Atlas 800I A2 Inference series (Atlas 800I A2)" +#~ msgstr "Atlas 800I A2 推理系列(Atlas 800I A2)" + +#~ msgid "Setup environment using container" +#~ msgstr "使用容器设置环境" + +#~ msgid "Ubuntu" +#~ msgstr "Ubuntu" + +#~ msgid "openEuler" +#~ msgstr "openEuler" + +#~ msgid "" +#~ "The default workdir is `/workspace`, " +#~ "vLLM and vLLM Kunlun code are " +#~ "placed in `/vllm-workspace` and " +#~ "installed in [development " +#~ "mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip" +#~ " install -e`) to help developer " +#~ "immediately take place changes without " +#~ "requiring a new installation." 
+#~ msgstr "" +#~ "默认的工作目录是 `/workspace`,vLLM 和 vLLM Kunlun " +#~ "代码被放置在 `/vllm-" +#~ "workspace`,并以[开发模式](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip" +#~ " install -e`)安装,以便开发者能够即时生效更改,而无需重新安装。" + +#~ msgid "Usage" +#~ msgstr "用法" + +#~ msgid "You can use Modelscope mirror to speed up download:" +#~ msgstr "你可以使用 Modelscope 镜像来加速下载:" + +#~ msgid "There are two ways to start vLLM on Kunlun XPU:" +#~ msgstr "在昆仑 XPU 上启动 vLLM 有两种方式:" + +#~ msgid "Offline Batched Inference" +#~ msgstr "离线批量推理" + +#~ msgid "" +#~ "With vLLM installed, you can start " +#~ "generating texts for list of input " +#~ "prompts (i.e. offline batch inferencing)." +#~ msgstr "安装了 vLLM 后,您可以开始为一系列输入提示生成文本(即离线批量推理)。" + +#~ msgid "" +#~ "Try to run below Python script " +#~ "directly or use `python3` shell to " +#~ "generate texts:" +#~ msgstr "尝试直接运行下面的 Python 脚本,或者使用 `python3` 交互式命令行来生成文本:" + +#~ msgid "OpenAI Completions API" +#~ msgstr "OpenAI Completions API" + +#~ msgid "" +#~ "vLLM can also be deployed as a " +#~ "server that implements the OpenAI API" +#~ " protocol. Run the following command " +#~ "to start the vLLM server with the" +#~ " [Qwen/Qwen2.5-0.5B-" +#~ "Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) " +#~ "model:" +#~ msgstr "" +#~ "vLLM 也可以作为实现 OpenAI API 协议的服务器进行部署。运行以下命令,使用" +#~ " [Qwen/Qwen2.5-0.5B-" +#~ "Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) " +#~ "模型启动 vLLM 服务器:" + +#~ msgid "If you see log as below:" +#~ msgstr "如果你看到如下日志:" + +#~ msgid "Congratulations, you have successfully started the vLLM server!" +#~ msgstr "恭喜,你已经成功启动了 vLLM 服务器!"
+ +#~ msgid "You can query the list the models:" +#~ msgstr "你可以查询模型列表:" + +#~ msgid "You can also query the model with input prompts:" +#~ msgstr "你也可以通过输入提示来查询模型:" + +#~ msgid "" +#~ "vLLM is serving as background process," +#~ " you can use `kill -2 $VLLM_PID` " +#~ "to stop the background process " +#~ "gracefully, it's equal to `Ctrl-C` to" +#~ " stop foreground vLLM process:" +#~ msgstr "" +#~ "vLLM 正作为后台进程运行,你可以使用 `kill -2 $VLLM_PID` " +#~ "来优雅地停止后台进程,这等同于使用 `Ctrl-C` 停止前台 vLLM 进程:" + +#~ msgid "You will see output as below:" +#~ msgstr "你将会看到如下输出:" + +#~ msgid "Finally, you can exit container by using `ctrl-D`." +#~ msgstr "最后,你可以通过按 `ctrl-D` 退出容器。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/DeepSeek-V3.2-Exp.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/DeepSeek-V3.2-Exp.po new file mode 100644 index 0000000..7e85e79 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/DeepSeek-V3.2-Exp.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/DeepSeek-V3.2-Exp.md:1 +msgid "DeepSeek-V3.2-Exp" +msgstr "" + +#: ../../source/tutorials/DeepSeek-V3.2-Exp.md:3 +msgid "Introduction" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po new file mode 100644 index 0000000..c4cf846 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po @@ -0,0 +1,29 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/index.md:3 +msgid "Deployment" +msgstr "部署" + +#: ../../tutorials/index.md:1 +msgid "Tutorials" +msgstr "教程" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po new file mode 100644 index 0000000..c1a6be0 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po @@ -0,0 +1,213 @@ +# SOME DESCRIPTIVE TITLE. 
+# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_node.md:1 +msgid "Multi-Node-DP (DeepSeek)" +msgstr "多节点数据并行(DeepSeek)" + +#: ../../source/tutorials/multi_node.md:3 +msgid "Getting Start" +msgstr "快速开始" + +#~ msgid "" +#~ "vLLM-Kunlun now supports Data Parallel" +#~ " (DP) deployment, enabling model weights" +#~ " to be replicated across multiple " +#~ "XPUs or instances, each processing " +#~ "independent batches of requests. This is" +#~ " particularly useful for scaling throughput" +#~ " across devices while maintaining high " +#~ "resource utilization." +#~ msgstr "" +#~ "vLLM-Kunlun 现在支持数据并行(DP)部署,可以在多个 XPU " +#~ "或实例之间复制模型权重,每个实例处理独立的请求批次。这对于在保证高资源利用率的同时,实现跨设备的吞吐量扩展特别有用。" + +#~ msgid "" +#~ "Each DP rank is deployed as a " +#~ "separate “core engine” process which " +#~ "communicates with front-end process(es) " +#~ "via ZMQ sockets. Data Parallel can " +#~ "be combined with Tensor Parallel, in " +#~ "which case each DP engine owns a" +#~ " number of per-XPU worker processes" +#~ " equal to the TP size."
+#~ msgstr "" +#~ "每个 DP 进程作为一个单独的“核心引擎”进程部署,并通过 ZMQ " +#~ "套接字与前端进程通信。数据并行可以与张量并行结合使用,此时每个 DP 引擎拥有数量等于 TP " +#~ "大小的每 XPU 工作进程。" + +#~ msgid "" +#~ "For Mixture-of-Experts (MoE) models " +#~ "— especially advanced architectures like " +#~ "DeepSeek that utilize Multi-head Latent" +#~ " Attention (MLA) — a hybrid " +#~ "parallelism approach is recommended: - " +#~ "Use **Data Parallelism (DP)** for " +#~ "attention layers, which are replicated " +#~ "across devices and handle separate " +#~ "batches. - Use **Expert or Tensor" +#~ " Parallelism (EP/TP)** for expert layers," +#~ " which are sharded across devices to" +#~ " distribute the computation." +#~ msgstr "" +#~ "对于混合专家(Mixture-of-Experts, MoE)模型——尤其是像 " +#~ "DeepSeek 这样采用多头潜在注意力(Multi-head Latent " +#~ "Attention, MLA)的高级架构——推荐使用混合并行策略:\n" +#~ " - 对于注意力层,使用 **数据并行(Data Parallelism, DP)**,这些层会在各设备间复刻,并处理不同的批次。\n" +#~ " - 对于专家层,使用 **专家并行或张量并行(Expert or " +#~ "Tensor Parallelism, EP/TP)**,这些层会在设备间分片,从而分担计算。" + +#~ msgid "" +#~ "This division enables attention layers " +#~ "to be replicated across Data Parallel" +#~ " (DP) ranks, enabling them to process" +#~ " different batches independently. Meanwhile, " +#~ "expert layers are partitioned (sharded) " +#~ "across devices using Expert or Tensor" +#~ " Parallelism(DP*TP), maximizing hardware " +#~ "utilization and efficiency." +#~ msgstr "这种划分使得注意力层能够在数据并行(DP)组内复制,从而能够独立处理不同的批次。同时,专家层通过专家或张量并行(DP*TP)在设备间进行分区(切片),最大化硬件利用率和效率。" + +#~ msgid "" +#~ "In these cases the data parallel " +#~ "ranks are not completely independent, " +#~ "forward passes must be aligned and " +#~ "expert layers across all ranks are " +#~ "required to synchronize during every " +#~ "forward pass, even if there are " +#~ "fewer requests to be processed than " +#~ "DP ranks." 
+#~ msgstr "" +#~ "在这些情况下,数据并行的各个 rank 不是完全独立的,前向传播必须对齐,并且所有 rank " +#~ "上的专家层在每次前向传播时都需要同步,即使待处理的请求数量少于 DP rank 的数量。" + +#~ msgid "" +#~ "For MoE models, when any requests " +#~ "are in progress in any rank, we" +#~ " must ensure that empty “dummy” " +#~ "forward passes are performed in all " +#~ "ranks which don’t currently have any " +#~ "requests scheduled. This is handled via" +#~ " a separate DP `Coordinator` process " +#~ "which communicates with all of the " +#~ "ranks, and a collective operation " +#~ "performed every N steps to determine " +#~ "when all ranks become idle and can" +#~ " be paused. When TP is used in" +#~ " conjunction with DP, expert layers " +#~ "form an EP or TP group of " +#~ "size (DP x TP)." +#~ msgstr "" +#~ "对于 MoE 模型,当任何一个 rank 有请求正在进行时,必须确保所有当前没有请求的" +#~ " rank 都执行空的“虚拟”前向传播。这是通过一个单独的 DP `Coordinator`" +#~ " 协调器进程来实现的,该进程与所有 rank 通信,并且每隔 N " +#~ "步执行一次集体操作,以判断所有 rank 是否都处于空闲状态并可以暂停。当 TP 与 " +#~ "DP 结合使用时,专家层会组成一个规模为(DP x TP)的 EP 或 " +#~ "TP 组。" + +#~ msgid "Verify Multi-Node Communication Environment" +#~ msgstr "验证多节点通信环境" + +#~ msgid "Physical Layer Requirements:" +#~ msgstr "物理层要求:" + +#~ msgid "" +#~ "The physical machines must be located" +#~ " on the same WLAN, with network " +#~ "connectivity." +#~ msgstr "物理机器必须位于同一个 WLAN 中,并且具有网络连接。" + +#~ msgid "" +#~ "All XPUs are connected with optical " +#~ "modules, and the connection status must" +#~ " be normal." +#~ msgstr "所有 XPU 都通过光模块连接,且连接状态必须正常。" + +#~ msgid "Verification Process:" +#~ msgstr "验证流程:" + +#~ msgid "" +#~ "Execute the following commands on each" +#~ " node in sequence. The results must" +#~ " all be `success` and the status " +#~ "must be `UP`:" +#~ msgstr "在每个节点上依次执行以下命令。所有结果必须为 `success` 且状态必须为 `UP`:" + +#~ msgid "XPU Interconnect Verification:" +#~ msgstr "XPU 互连验证:" + +#~ msgid "1. Get XPU IP Addresses" +#~ msgstr "1. 获取 XPU IP 地址" + +#~ msgid "2. Cross-Node PING Test" +#~ msgstr "2. 
跨节点PING测试" + +#~ msgid "Run with docker" +#~ msgstr "用 docker 运行" + +#~ msgid "" +#~ "Assume you have two Atlas 800 " +#~ "A2(64G*8) nodes, and want to deploy " +#~ "the `deepseek-v3-w8a8` quantitative model " +#~ "across multi-node." +#~ msgstr "假设你有两台 Atlas 800 A2(64G*8)节点,并且想要在多节点上部署 `deepseek-v3-w8a8` 量化模型。" + +#~ msgid "" +#~ "Before launch the inference server, " +#~ "ensure some environment variables are " +#~ "set for multi node communication" +#~ msgstr "在启动推理服务器之前,确保已经为多节点通信设置了一些环境变量。" + +#~ msgid "Run the following scripts on two nodes respectively" +#~ msgstr "分别在两台节点上运行以下脚本" + +#~ msgid "**node0**" +#~ msgstr "**节点0**" + +#~ msgid "**node1**" +#~ msgstr "**节点1**" + +#~ msgid "" +#~ "The Deployment view looks like: ![alt" +#~ " text](../assets/multi_node_dp.png)" +#~ msgstr "部署视图如下所示:![替代文本](../assets/multi_node_dp.png)" + +#~ msgid "alt text" +#~ msgstr "替代文本" + +#~ msgid "" +#~ "Once your server is started, you " +#~ "can query the model with input " +#~ "prompts:" +#~ msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#~ msgid "Run benchmarks" +#~ msgstr "运行基准测试" + +#~ msgid "" +#~ "For details please refer to " +#~ "[benchmark](https://github.com/vllm-project/vllm-" +#~ "kunlun/tree/main/benchmarks)" +#~ msgstr "" +#~ "详细信息请参阅 [benchmark](https://github.com/vllm-project" +#~ "/vllm-kunlun/tree/main/benchmarks)" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_kimi.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_kimi.po new file mode 100644 index 0000000..4c46098 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_kimi.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_node_kimi.md:1 +msgid "Multi-Node-DP (Kimi-K2)" +msgstr "" + +#: ../../source/tutorials/multi_node_kimi.md:3 +msgid "Verify Multi-Node Communication Environment" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_pd_disaggregation_llmdatadist.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_pd_disaggregation_llmdatadist.po new file mode 100644 index 0000000..654cd1a --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_pd_disaggregation_llmdatadist.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_node_pd_disaggregation_llmdatadist.md:1 +msgid "Prefill-Decode Disaggregation Llmdatadist Verification (Qwen)" +msgstr "" + +#: ../../source/tutorials/multi_node_pd_disaggregation_llmdatadist.md:3 +msgid "Getting Start" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_pd_disaggregation_mooncake.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_pd_disaggregation_mooncake.po new file mode 100644 index 0000000..039f442 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_pd_disaggregation_mooncake.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_node_pd_disaggregation_mooncake.md:1 +msgid "Prefill-Decode Disaggregation Mooncake Verification (Qwen)" +msgstr "" + +#: ../../source/tutorials/multi_node_pd_disaggregation_mooncake.md:3 +msgid "Getting Start" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_qwen3vl.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_qwen3vl.po new file mode 100644 index 0000000..48d1aaa --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_qwen3vl.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_node_qwen3vl.md:1 +msgid "Multi-Node-DP (Qwen3-VL-235B-A22B)" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_ray.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_ray.po new file mode 100644 index 0000000..15b7003 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node_ray.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_node_ray.md:1 +msgid "Multi-Node-Ray (Qwen/Qwen3-235B-A22B)" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po new file mode 100644 index 0000000..c062a4b --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po @@ -0,0 +1,53 @@ +# SOME DESCRIPTIVE TITLE. 
+# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_npu.md:1 +msgid "Multi-XPU (QwQ 32B)" +msgstr "多-XPU(QwQ 32B)" + +#~ msgid "Run vllm-kunlun on Multi-XPU" +#~ msgstr "在多XPU上运行 vllm-kunlun" + +#~ msgid "Run docker container:" +#~ msgstr "运行 docker 容器:" + +#~ msgid "Setup environment variables:" +#~ msgstr "设置环境变量:" + +#~ msgid "Online Inference on Multi-XPU" +#~ msgstr "多XPU的在线推理" + +#~ msgid "Run the following script to start the vLLM server on Multi-XPU:" +#~ msgstr "运行以下脚本,在多XPU上启动 vLLM 服务器:" + +#~ msgid "Once your server is started, you can query the model with input prompts" +#~ msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#~ msgid "Offline Inference on Multi-XPU" +#~ msgstr "多XPU离线推理" + +#~ msgid "Run the following script to execute offline inference on multi-XPU:" +#~ msgstr "运行以下脚本以在多XPU上执行离线推理:" + +#~ msgid "If you run this script successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po new file mode 100644 index 0000000..76e7b6f --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po @@ -0,0 +1,74 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_npu_moge.md:1 +msgid "Multi-XPU (Pangu Pro MoE)" +msgstr "多XPU(Pangu Pro MoE)" + +#~ msgid "Run vllm-kunlun on Multi-XPU" +#~ msgstr "在多XPU上运行 vllm-kunlun" + +#~ msgid "Run container:" +#~ msgstr "运行容器:" + +#~ msgid "Setup environment variables:" +#~ msgstr "设置环境变量:" + +#~ msgid "Download the model:" +#~ msgstr "下载该模型:" + +#~ msgid "Online Inference on Multi-XPU" +#~ msgstr "多XPU上的在线推理" + +#~ msgid "Run the following script to start the vLLM server on Multi-XPU:" +#~ msgstr "运行以下脚本,在多XPU上启动 vLLM 服务器:" + +#~ msgid "" +#~ "Once your server is started, you " +#~ "can query the model with input " +#~ "prompts:" +#~ msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#~ msgid "v1/completions" +#~ msgstr "v1/补全" + +#~ msgid "v1/chat/completions" +#~ msgstr "v1/chat/completions" + +#~ msgid "If you run this successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行这个,你可以看到如下所示的信息:" + +#~ msgid "Offline Inference on Multi-XPU" +#~ msgstr "多XPU离线推理" + +#~ msgid "Run the following script to execute offline inference on multi-XPU:" +#~ msgstr "运行以下脚本以在多XPU上执行离线推理:" + +#~ msgid "Graph Mode" +#~ msgstr "图模式" + +#~ msgid "Eager Mode" +#~ msgstr "即时模式" + +#~ msgid "If you run this script successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po new file mode 100644 index 0000000..85b80f2 --- /dev/null +++ 
b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po @@ -0,0 +1,82 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_npu_quantization.md:1 +msgid "Multi-XPU (QwQ 32B W8A8)" +msgstr "多XPU(QwQ 32B W8A8)" + +#: ../../source/tutorials/multi_npu_quantization.md:3 +#, fuzzy +msgid "Run Docker Container" +msgstr "运行 docker 容器" + +#~ msgid "w8a8 quantization feature is supported by v0.8.4rc2 or higher" +#~ msgstr "w8a8 量化功能由 v0.8.4rc2 或更高版本支持" + +#~ msgid "Install modelslim and convert model" +#~ msgstr "安装 modelslim 并转换模型" + +#~ msgid "" +#~ "You can choose to convert the " +#~ "model yourself or use the quantized " +#~ "model we uploaded, see " +#~ "https://www.modelscope.cn/models/vllm-kunlun/QwQ-32B-" +#~ "W8A8" +#~ msgstr "" +#~ "你可以选择自己转换模型,或者使用我们上传的量化模型,详见 https://www.modelscope.cn/models" +#~ "/vllm-kunlun/QwQ-32B-W8A8" + +#~ msgid "Verify the quantized model" +#~ msgstr "验证量化模型" + +#~ msgid "The converted model files looks like:" +#~ msgstr "转换后的模型文件如下所示:" + +#~ msgid "Run the following script to start the vLLM server with quantized model:" +#~ msgstr "运行以下脚本以启动带有量化模型的 vLLM 服务器:" + +#~ msgid "" +#~ "The value \"kunlun\" for \"--" +#~ "quantization\" argument will be supported " +#~ "after [a specific PR](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/877) is merged and" +#~ " released, you can cherry-pick this" +#~ " commit for now." 
+#~ msgstr "" +#~ "在 [特定的PR](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/877) 合并并发布后, \"--quantization\" " +#~ "参数将支持值 \"kunlun\",你也可以现在手动挑选该提交。" + +#~ msgid "Once your server is started, you can query the model with input prompts" +#~ msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#~ msgid "" +#~ "Run the following script to execute " +#~ "offline inference on multi-XPU with " +#~ "quantized model:" +#~ msgstr "运行以下脚本,在多XPU上使用量化模型执行离线推理:" + +#~ msgid "" +#~ "To enable quantization for kunlun, " +#~ "quantization method must be \"kunlun\"" +#~ msgstr "要在kunlun上启用量化,量化方法必须为“kunlun”。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po new file mode 100644 index 0000000..9f947ec --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po @@ -0,0 +1,63 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_npu_qwen3_moe.md:1 +msgid "Multi-XPU (Qwen3-30B-A3B)" +msgstr "多XPU(Qwen3-30B-A3B)" + +#~ msgid "Run vllm-kunlun on Multi-XPU with Qwen3 MoE" +#~ msgstr "在多XPU上运行带有Qwen3 MoE的vllm-kunlun" + +#~ msgid "Run docker container:" +#~ msgstr "运行 docker 容器:" + +#~ msgid "Setup environment variables:" +#~ msgstr "设置环境变量:" + +#~ msgid "Online Inference on Multi-XPU" +#~ msgstr "多XPU的在线推理" + +#~ msgid "Run the following script to start the vLLM server on Multi-XPU:" +#~ msgstr "运行以下脚本以在多XPU上启动 vLLM 服务器:" + +#~ msgid "" +#~ "For an Atlas A2 with 64GB of " +#~ "XPU card memory, tensor-parallel-size" +#~ " should be at least 2, and for" +#~ " 32GB of memory, tensor-parallel-size" +#~ " should be at least 4." 
+#~ msgstr "" +#~ "对于拥有64GB XPU卡内存的Atlas A2,tensor-parallel-size" +#~ " 至少应为2;对于32GB内存的XPU卡,tensor-parallel-size 至少应为4。" + +#~ msgid "Once your server is started, you can query the model with input prompts" +#~ msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#~ msgid "Offline Inference on Multi-XPU" +#~ msgstr "多XPU离线推理" + +#~ msgid "Run the following script to execute offline inference on multi-XPU:" +#~ msgstr "运行以下脚本以在多XPU上执行离线推理:" + +#~ msgid "If you run this script successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_next.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_next.po new file mode 100644 index 0000000..a7eaacf --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_next.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/multi_npu_qwen3_next.md:1 +msgid "Multi-XPU (Qwen3-Next)" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po new file mode 100644 index 0000000..6087db2 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po @@ -0,0 +1,94 @@ +# SOME DESCRIPTIVE TITLE. 
+# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/single_node_300i.md:1 +#, fuzzy +msgid "Single Node (Atlas 300I Series)" +msgstr "单节点(Atlas 300I 系列)" + +#~ msgid "" +#~ "This Atlas 300I series is currently " +#~ "experimental. In future versions, there " +#~ "may be behavioral changes around model" +#~ " coverage, performance improvement." +#~ msgstr "Atlas 300I 系列目前处于实验阶段。在未来的版本中,模型覆盖范围和性能提升方面可能会有行为上的变化。" + +#~ msgid "Run vLLM on Altlas 300I series" +#~ msgstr "在 Altlas 300I 系列上运行 vLLM" + +#~ msgid "Run docker container:" +#~ msgstr "运行 docker 容器:" + +#~ msgid "Setup environment variables:" +#~ msgstr "设置环境变量:" + +#~ msgid "Online Inference on XPU" +#~ msgstr "在XPU上进行在线推理" + +#~ msgid "" +#~ "Run the following script to start " +#~ "the vLLM server on XPU(Qwen3-0.6B:1 " +#~ "card, Qwen2.5-7B-Instruct:2 cards, Pangu-" +#~ "Pro-MoE-72B: 8 cards):" +#~ msgstr "" +#~ "运行以下脚本,在 XPU 上启动 vLLM 服务器(Qwen3-0.6B:1 " +#~ "张卡,Qwen2.5-7B-Instruct:2 张卡,Pangu-Pro-MoE-" +#~ "72B:8 张卡):" + +#~ msgid "Qwen3-0.6B" +#~ msgstr "Qwen3-0.6B" + +#~ msgid "Run the following command to start the vLLM server:" +#~ msgstr "运行以下命令以启动 vLLM 服务器:" + +#~ msgid "Once your server is started, you can query the model with input prompts" +#~ msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#~ msgid "Qwen/Qwen2.5-7B-Instruct" +#~ msgstr "Qwen/Qwen2.5-7B-Instruct" + +#~ msgid "Pangu-Pro-MoE-72B" +#~ msgstr "Pangu-Pro-MoE-72B" + +#~ msgid "Download the 
model:" +#~ msgstr "下载该模型:" + +#~ msgid "If you run this script successfully, you can see the results." +#~ msgstr "如果你成功运行此脚本,你就可以看到结果。" + +#~ msgid "Offline Inference" +#~ msgstr "离线推理" + +#~ msgid "" +#~ "Run the following script (`example.py`) " +#~ "to execute offline inference on XPU:" +#~ msgstr "运行以下脚本(`example.py`)以在 XPU 上执行离线推理:" + +#~ msgid "Qwen2.5-7B-Instruct" +#~ msgstr "Qwen2.5-7B-Instruct" + +#~ msgid "Run script:" +#~ msgstr "运行脚本:" + +#~ msgid "If you run this script successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po new file mode 100644 index 0000000..4393363 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po @@ -0,0 +1,106 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/single_npu.md:1 +msgid "Single XPU (Qwen3 8B)" +msgstr "单个XPU(Qwen3 8B)" + +#: ../../source/tutorials/single_npu.md:3 +msgid "Run vllm-kunlun on Single XPU" +msgstr "在单个 XPU 上运行 vllm-kunlun" + +#: ../../source/tutorials/single_npu.md:5 +msgid "Offline Inference on Single XPU" +msgstr "在单个XPU上进行离线推理" + +#~ msgid "Run docker container:" +#~ msgstr "运行 docker 容器:" + +#~ msgid "Setup environment variables:" +#~ msgstr "设置环境变量:" + +#~ msgid "" +#~ "`max_split_size_mb` prevents the native " +#~ "allocator from splitting blocks larger " +#~ "than this size (in MB). This can" +#~ " reduce fragmentation and may allow " +#~ "some borderline workloads to complete " +#~ "without running out of memory. You " +#~ "can find more details " +#~ "[here](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)." 
+#~ msgstr "" +#~ "`max_split_size_mb` 防止本地分配器拆分超过此大小(以 MB " +#~ "为单位)的内存块。这可以减少内存碎片,并且可能让一些边缘情况下的工作负载顺利完成而不会耗尽内存。你可以在[这里](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)找到更多详细信息。" + +#~ msgid "Run the following script to execute offline inference on a single XPU:" +#~ msgstr "运行以下脚本以在单个 XPU 上执行离线推理:" + +#~ msgid "Graph Mode" +#~ msgstr "图模式" + +#~ msgid "Eager Mode" +#~ msgstr "即时模式" + +#~ msgid "If you run this script successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + +#~ msgid "Online Serving on Single XPU" +#~ msgstr "单个 XPU 上的在线服务" + +#~ msgid "Run docker container to start the vLLM server on a single XPU:" +#~ msgstr "运行 docker 容器,在单个 XPU 上启动 vLLM 服务器:" + +#~ msgid "" +#~ "Add `--max_model_len` option to avoid " +#~ "ValueError that the Qwen2.5-7B model's " +#~ "max seq len (32768) is larger than" +#~ " the maximum number of tokens that" +#~ " can be stored in KV cache " +#~ "(26240). This will differ with different" +#~ " XPU series base on the HBM " +#~ "size. Please modify the value according" +#~ " to a suitable value for your " +#~ "XPU series." 
+#~ msgstr "" +#~ "添加 `--max_model_len` 选项,以避免出现 Qwen2.5-7B " +#~ "模型的最大序列长度(32768)大于 KV 缓存能存储的最大 token " +#~ "数(26240)时的 ValueError。不同 XPU 系列由于 HBM " +#~ "容量不同,该值也会有所不同。请根据您的 XPU 系列,修改为合适的数值。" + +#~ msgid "If your service start successfully, you can see the info shown below:" +#~ msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#~ msgid "" +#~ "Once your server is started, you " +#~ "can query the model with input " +#~ "prompts:" +#~ msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#~ msgid "" +#~ "If you query the server successfully," +#~ " you can see the info shown " +#~ "below (client):" +#~ msgstr "如果你成功查询了服务器,你可以看到如下所示的信息(客户端):" + +#~ msgid "Logs of the vllm server:" +#~ msgstr "vllm 服务器的日志:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po new file mode 100644 index 0000000..40b5a55 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po @@ -0,0 +1,77 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_npu_audio.md:1 +msgid "Single XPU (Qwen2-Audio 7B)" +msgstr "单个 XPU(Qwen2-Audio 7B)" + +#: ../../tutorials/single_npu_audio.md:3 +msgid "Run vllm-kunlun on Single XPU" +msgstr "在单个 XPU 上运行 vllm-kunlun" + +#: ../../tutorials/single_npu_audio.md:5 +msgid "Offline Inference on Single XPU" +msgstr "在单个XPU上进行离线推理" + +#: ../../tutorials/single_npu_audio.md:7 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/single_npu_audio.md:29 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/single_npu_audio.md:40 +msgid "" +"`max_split_size_mb` prevents the native allocator from splitting blocks " +"larger than this size (in MB). This can reduce fragmentation and may allow " +"some borderline workloads to complete without running out of memory. You can" +" find more details " +"[here](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)." 
+msgstr "" +"`max_split_size_mb` 防止本地分配器拆分超过此大小(以 MB " +"为单位)的内存块。这可以减少内存碎片,并且可能让一些边缘情况下的工作负载顺利完成而不会耗尽内存。你可以在[这里](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)找到更多详细信息。" + +#: ../../tutorials/single_npu_audio.md:43 +msgid "Install packages required for audio processing:" +msgstr "安装音频处理所需的软件包:" + +#: ../../tutorials/single_npu_audio.md:50 +msgid "Run the following script to execute offline inference on a single XPU:" +msgstr "运行以下脚本以在单个 XPU 上执行离线推理:" + +#: ../../tutorials/single_npu_audio.md:114 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + +#: ../../tutorials/single_npu_audio.md:120 +msgid "Online Serving on Single XPU" +msgstr "单个 XPU 上的在线服务" + +#: ../../tutorials/single_npu_audio.md:122 +msgid "" +"Currently, vllm's OpenAI-compatible server doesn't support audio inputs, " +"find more details [here](https://github.com/vllm-" +"project/vllm/issues/19977)." +msgstr "" +"目前,vllm 的兼容 OpenAI 的服务器不支持音频输入,更多详情请查看[这里](https://github.com/vllm-" +"project/vllm/issues/19977)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po new file mode 100644 index 0000000..1f14a86 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po @@ -0,0 +1,99 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_npu_multimodal.md:1 +msgid "Single XPU (Qwen2.5-VL 7B)" +msgstr "单个XPU(Qwen2.5-VL 7B)" + +#: ../../tutorials/single_npu_multimodal.md:3 +msgid "Run vllm-kunlun on Single XPU" +msgstr "在单个 XPU 上运行 vllm-kunlun" + +#: ../../tutorials/single_npu_multimodal.md:5 +msgid "Offline Inference on Single XPU" +msgstr "在单个XPU上进行离线推理" + +#: ../../tutorials/single_npu_multimodal.md:7 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/single_npu_multimodal.md:29 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/single_npu_multimodal.md:40 +msgid "" +"`max_split_size_mb` prevents the native allocator from splitting blocks " +"larger than this size (in MB). This can reduce fragmentation and may allow " +"some borderline workloads to complete without running out of memory. You can" +" find more details " +"[here](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)." 
+msgstr "" +"`max_split_size_mb` 防止本地分配器拆分超过此大小(以 MB " +"为单位)的内存块。这可以减少内存碎片,并且可能让一些边缘情况下的工作负载顺利完成而不会耗尽内存。你可以在[这里](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)找到更多详细信息。" + +#: ../../tutorials/single_npu_multimodal.md:43 +msgid "Run the following script to execute offline inference on a single XPU:" +msgstr "运行以下脚本以在单个 XPU 上执行离线推理:" + +#: ../../tutorials/single_npu_multimodal.md:109 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + +#: ../../tutorials/single_npu_multimodal.md:121 +msgid "Online Serving on Single XPU" +msgstr "单个 XPU 上的在线服务" + +#: ../../tutorials/single_npu_multimodal.md:123 +msgid "Run docker container to start the vLLM server on a single XPU:" +msgstr "运行 docker 容器,在单个 XPU 上启动 vLLM 服务器:" + +#: ../../tutorials/single_npu_multimodal.md:154 +msgid "" +"Add `--max_model_len` option to avoid ValueError that the " +"Qwen2.5-VL-7B-Instruct model's max seq len (128000) is larger than the " +"maximum number of tokens that can be stored in KV cache. This will differ " +"with different XPU series base on the HBM size. Please modify the value " +"according to a suitable value for your XPU series." 
+msgstr "" +"新增 `--max_model_len` 选项,以避免出现 ValueError,即 Qwen2.5-VL-7B-Instruct " +"模型的最大序列长度(128000)大于 KV 缓存可存储的最大 token 数。该数值会根据不同 XPU 系列的 HBM 大小而不同。请根据你的 XPU" +" 系列,将该值设置为合适的数值。" + +#: ../../tutorials/single_npu_multimodal.md:157 +msgid "If your service start successfully, you can see the info shown below:" +msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#: ../../tutorials/single_npu_multimodal.md:165 +msgid "" +"Once your server is started, you can query the model with input prompts:" +msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#: ../../tutorials/single_npu_multimodal.md:182 +msgid "" +"If you query the server successfully, you can see the info shown below " +"(client):" +msgstr "如果你成功查询了服务器,你可以看到如下所示的信息(客户端):" + +#: ../../tutorials/single_npu_multimodal.md:188 +msgid "Logs of the vllm server:" +msgstr "vllm 服务器的日志:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen2.5_vl.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen2.5_vl.po new file mode 100644 index 0000000..e4124bb --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen2.5_vl.po @@ -0,0 +1,38 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/single_npu_qwen2.5_vl.md:1 +msgid "Single XPU (Qwen2.5-VL 7B)" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen2.5_vl.md:3 +msgid "Run vllm-kunlun on Single XPU" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen2.5_vl.md:5 +msgid "Offline Inference on Single XPU" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen2.5_vl.md:7 +msgid "Run docker container:" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen2_audio.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen2_audio.po new file mode 100644 index 0000000..08f401d --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen2_audio.po @@ -0,0 +1,38 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/single_npu_qwen2_audio.md:1 +msgid "Single XPU (Qwen2-Audio 7B)" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen2_audio.md:3 +msgid "Run vllm-kunlun on Single XPU" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen2_audio.md:5 +msgid "Offline Inference on Single XPU" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen2_audio.md:7 +msgid "Run docker container:" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po new file mode 100644 index 0000000..108c864 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po @@ -0,0 +1,77 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/single_npu_qwen3_embedding.md:1 +msgid "Single XPU (Qwen3-Embedding-8B)" +msgstr "单个XPU(Qwen3-Embedding-8B)" + +#: ../../source/tutorials/single_npu_qwen3_embedding.md:3 +msgid "" +"The Qwen3 Embedding model series is the latest proprietary model of the " +"Qwen family," +msgstr "" + +#~ msgid "" +#~ "The Qwen3 Embedding model series is " +#~ "the latest proprietary model of the " +#~ "Qwen family, specifically designed for " +#~ "text embedding and ranking tasks. " +#~ "Building upon the dense foundational " +#~ "models of the Qwen3 series, it " +#~ "provides a comprehensive range of text" +#~ " embeddings and reranking models in " +#~ "various sizes (0.6B, 4B, and 8B). " +#~ "This guide describes how to run " +#~ "the model with vLLM Kunlun. Note " +#~ "that only 0.9.2rc1 and higher versions" +#~ " of vLLM Kunlun support the model." 
+#~ msgstr "" +#~ "Qwen3 Embedding 模型系列是 Qwen " +#~ "家族最新的专有模型,专为文本嵌入和排序任务设计。在 Qwen3 " +#~ "系列的密集基础模型之上,它提供了多种尺寸(0.6B、4B 和 8B)的文本嵌入与重排序模型。本指南介绍如何使用" +#~ " vLLM Kunlun 运行该模型。请注意,只有 vLLM Kunlun " +#~ "0.9.2rc1 及更高版本才支持该模型。" + +#~ msgid "Run docker container" +#~ msgstr "运行 docker 容器" + +#~ msgid "" +#~ "Take Qwen3-Embedding-8B model as an " +#~ "example, first run the docker container" +#~ " with the following command:" +#~ msgstr "以 Qwen3-Embedding-8B 模型为例,首先使用以下命令运行 docker 容器:" + +#~ msgid "Setup environment variables:" +#~ msgstr "设置环境变量:" + +#~ msgid "Online Inference" +#~ msgstr "在线推理" + +#~ msgid "Once your server is started, you can query the model with input prompts" +#~ msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#~ msgid "Offline Inference" +#~ msgstr "离线推理" + +#~ msgid "If you run this script successfully, you can see the info shown below:" +#~ msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_quantization.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_quantization.po new file mode 100644 index 0000000..0c4b1b3 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_quantization.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/tutorials/single_npu_qwen3_quantization.md:1 +msgid "Single-XPU (Qwen3 8B W4A8)" +msgstr "" + +#: ../../source/tutorials/single_npu_qwen3_quantization.md:3 +msgid "Run Docker Container" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po new file mode 100644 index 0000000..11df77d --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po @@ -0,0 +1,245 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/configuration/additional_config.md:1 +msgid "Additional Configuration" +msgstr "附加配置" + +#~ msgid "" +#~ "additional configuration is a mechanism " +#~ "provided by vLLM to allow plugins " +#~ "to control inner behavior by their " +#~ "own. vLLM Kunlun uses this mechanism " +#~ "to make the project more flexible." 
+#~ msgstr "额外配置是 vLLM 提供的一种机制,允许插件自行控制内部行为。vLLM Kunlun 利用这种机制使项目更加灵活。" + +#~ msgid "How to use" +#~ msgstr "如何使用" + +#~ msgid "" +#~ "With either online mode or offline " +#~ "mode, users can use additional " +#~ "configuration. Take Qwen3 as an example:" +#~ msgstr "无论是在线模式还是离线模式,用户都可以使用额外的配置。以 Qwen3 为例:" + +#~ msgid "**Online mode**:" +#~ msgstr "**在线模式**:" + +#~ msgid "**Offline mode**:" +#~ msgstr "**离线模式**:" + +#~ msgid "Configuration options" +#~ msgstr "配置选项" + +#~ msgid "" +#~ "The following table lists the additional" +#~ " configuration options available in vLLM" +#~ " Kunlun:" +#~ msgstr "下表列出了 vLLM Kunlun 中可用的其他配置选项:" + +#~ msgid "Name" +#~ msgstr "名称" + +#~ msgid "Type" +#~ msgstr "类型" + +#~ msgid "Default" +#~ msgstr "默认" + +#~ msgid "Description" +#~ msgstr "描述" + +#~ msgid "`torchair_graph_config`" +#~ msgstr "`torchair_graph_config`" + +#~ msgid "dict" +#~ msgstr "dict" + +#~ msgid "`{}`" +#~ msgstr "`{}`" + +#~ msgid "The config options for torchair graph mode" +#~ msgstr "torchair 图模式的配置选项" + +#~ msgid "`kunlun_scheduler_config`" +#~ msgstr "`kunlun_scheduler_config`" + +#~ msgid "The config options for kunlun scheduler" +#~ msgstr "kunlun 调度器的配置选项" + +#~ msgid "`expert_tensor_parallel_size`" +#~ msgstr "`expert_tensor_parallel_size`" + +#~ msgid "str" +#~ msgstr "str" + +#~ msgid "`0`" +#~ msgstr "`0`" + +#~ msgid "Expert tensor parallel size the model to use." +#~ msgstr "专家张量并行的模型大小设置。" + +#~ msgid "`refresh`" +#~ msgstr "`刷新`" + +#~ msgid "bool" +#~ msgstr "bool" + +#~ msgid "`false`" +#~ msgstr "`false`" + +#~ msgid "" +#~ "Whether to refresh global kunlun config" +#~ " content. This value is usually used" +#~ " by rlhf or ut/e2e test case." +#~ msgstr "是否刷新全局 kunlun 配置信息。此值通常由 rlhf 或 ut/e2e 测试用例使用。" + +#~ msgid "`expert_map_path`" +#~ msgstr "`expert_map_path`" + +#~ msgid "`None`" +#~ msgstr "`None`" + +#~ msgid "" +#~ "When using expert load balancing for " +#~ "the MOE model, an expert map path" +#~ " needs to be passed in." 
+#~ msgstr "在为MOE模型使用专家负载均衡时,需要传入专家映射路径。" + +#~ msgid "`False`" +#~ msgstr "`False`" + +#~ msgid "Whether to enable the fused operator-like chunked_prefill." +#~ msgstr "是否启用类似算子融合的 chunked_prefill 功能。" + +#~ msgid "`kv_cache_dtype`" +#~ msgstr "`kv_cache_dtype`" + +#~ msgid "" +#~ "When using the kv cache quantization " +#~ "method, kv cache dtype needs to be" +#~ " set, currently only int8 is " +#~ "supported." +#~ msgstr "当使用kv缓存量化方法时,需要设置kv缓存的数据类型,目前仅支持int8。" + +#~ msgid "The details of each config option are as follows:" +#~ msgstr "每个配置选项的详细信息如下:" + +#~ msgid "**torchair_graph_config**" +#~ msgstr "**torchair_graph_config**" + +#~ msgid "`enabled`" +#~ msgstr "`启用`" + +#~ msgid "" +#~ "Whether to enable torchair graph mode." +#~ " Currently only DeepSeek series models " +#~ "and PanguProMoE are supported to use " +#~ "torchair graph mode" +#~ msgstr "是否启用 torchair 图模式。目前仅支持 DeepSeek 系列模型和 PanguProMoE 使用 torchair 图模式。" + +#~ msgid "`enable_multistream_mla`" +#~ msgstr "`enable_multistream_mla`" + +#~ msgid "" +#~ "Whether to put vector ops of MLA" +#~ " to another stream. This option only" +#~ " takes effects on models using MLA" +#~ " (e.g., DeepSeek)." +#~ msgstr "是否将MLA的向量操作放到另一个流中。此选项仅对使用MLA的模型(例如,DeepSeek)有效。" + +#~ msgid "`multistream_overlap_shared_expert`" +#~ msgstr "`multistream_overlap_shared_expert`" + +#~ msgid "" +#~ "Whether to enable multistream shared " +#~ "expert. This option only takes effects" +#~ " on DeepSeek moe models." 
+#~ msgstr "是否启用多流共享专家功能。此选项仅对 DeepSeek MoE 模型生效。" + +#~ msgid "`enable_view_optimize`" +#~ msgstr "`enable_view_optimize`" + +#~ msgid "`True`" +#~ msgstr "`True`" + +#~ msgid "Whether to enable torchair view optimization" +#~ msgstr "是否启用torchair视图优化" + +#~ msgid "`use_cached_graph`" +#~ msgstr "`use_cached_graph`" + +#~ msgid "Whether to use cached graph" +#~ msgstr "是否使用缓存的图" + +#~ msgid "`graph_batch_sizes`" +#~ msgstr "`graph_batch_sizes`" + +#~ msgid "list[int]" +#~ msgstr "list[int]" + +#~ msgid "`[]`" +#~ msgstr "`[]`" + +#~ msgid "The batch size for torchair graph cache" +#~ msgstr "torchair 图缓存的批量大小" + +#~ msgid "`graph_batch_sizes_init`" +#~ msgstr "`graph_batch_sizes_init`" + +#~ msgid "Init graph batch size dynamically if `graph_batch_sizes` is empty" +#~ msgstr "如果 `graph_batch_sizes` 为空,则动态初始化图批大小" + +#~ msgid "`enable_kv_nz`" +#~ msgstr "`enable_kv_nz`" + +#~ msgid "" +#~ "Whether to enable kvcache NZ layout. " +#~ "This option only takes effects on " +#~ "models using MLA (e.g., DeepSeek)." +#~ msgstr "是否启用 kvcache NZ 布局。此选项仅对使用 MLA 的模型(例如 DeepSeek)生效。" + +#~ msgid "**kunlun_scheduler_config**" +#~ msgstr "**kunlun_scheduler_config**" + +#~ msgid "Whether to enable kunlun scheduler for V1 engine" +#~ msgstr "是否为 V1 引擎启用 kunlun 调度器" + +#~ msgid "" +#~ "kunlun_scheduler_config also support the " +#~ "options from [vllm scheduler " +#~ "config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig)." +#~ " For example, you can add " +#~ "`enable_chunked_prefill: True` to " +#~ "kunlun_scheduler_config as well."
+#~ msgstr "" +#~ "kunlun_scheduler_config 也支持来自 [vllm scheduler " +#~ "config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig)" +#~ " 的选项。例如,你也可以在 kunlun_scheduler_config 中添加 " +#~ "`enable_chunked_prefill: True`。" + +#~ msgid "Example" +#~ msgstr "示例" + +#~ msgid "An example of additional configuration is as follows:" +#~ msgstr "以下是额外配置的一个示例:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po new file mode 100644 index 0000000..c6f5934 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po @@ -0,0 +1,29 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/configuration/env_vars.md:1 +msgid "Environment Variables" +msgstr "环境变量" + +#~ msgid "" +#~ "vllm-kunlun uses the following " +#~ "environment variables to configure the " +#~ "system:" +#~ msgstr "vllm-kunlun 使用以下环境变量来配置系统:" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po new file mode 100644 index 0000000..8183752 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po @@ -0,0 +1,32 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. 
+# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 19:12+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/configuration/index.md:1 +#: ../../source/user_guide/configuration/index.md:5 +msgid "Configuration Guide" +msgstr "配置指南" + +#: ../../source/user_guide/configuration/index.md:3 +#, fuzzy +msgid "This section provides a detailed configuration guide of vLLM Kunlun." +msgstr "本节提供了 vLLM Kunlun 的详细配置指南。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/dynamic_batch.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/dynamic_batch.po new file mode 100644 index 0000000..d4bac69 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/dynamic_batch.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/dynamic_batch.md:1 +msgid "Dynamic Batch" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/eplb_swift_balancer.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/eplb_swift_balancer.po new file mode 100644 index 0000000..5e36029 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/eplb_swift_balancer.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/eplb_swift_balancer.md:1 +msgid "Expert Load Balance (EPLB)" +msgstr "" + +#: ../../source/user_guide/feature_guide/eplb_swift_balancer.md:3 +msgid "Overview" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po new file mode 100644 index 0000000..5696a9d --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po @@ -0,0 +1,126 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/graph_mode.md:1 +msgid "Graph Mode Guide" +msgstr "图模式指南" + +#~ msgid "" +#~ "This feature is currently experimental. " +#~ "In future versions, there may be " +#~ "behavioral changes around configuration, " +#~ "coverage, performance improvement." 
+#~ msgstr "此功能目前为实验性功能。在未来的版本中,配置、覆盖率和性能改进等方面的行为可能会有变化。" + +#~ msgid "" +#~ "This guide provides instructions for " +#~ "using Kunlun Graph Mode with vLLM " +#~ "Kunlun. Please note that graph mode " +#~ "is only available on V1 Engine. " +#~ "And only Qwen, DeepSeek series models" +#~ " are well tested from 0.9.0rc1. We'll" +#~ " make it stable and generalize in " +#~ "the next release." +#~ msgstr "" +#~ "本指南提供了在 vLLM Kunlun 上使用 Kunlun " +#~ "图模式的操作说明。请注意,图模式仅在 V1 引擎上可用,并且从 0.9.0rc1 起,仅对" +#~ " Qwen、DeepSeek 系列模型进行了充分测试。我们将在下一个版本中使其更加稳定和通用。" + +#~ msgid "Getting Started" +#~ msgstr "快速入门" + +#~ msgid "" +#~ "From v0.9.1rc1 with V1 Engine, vLLM " +#~ "Kunlun will run models in graph " +#~ "mode by default to keep the same" +#~ " behavior with vLLM. If you hit " +#~ "any issues, please feel free to " +#~ "open an issue on GitHub and " +#~ "fallback to eager mode temporarily by" +#~ " set `enforce_eager=True` when initializing " +#~ "the model." +#~ msgstr "" +#~ "从 v0.9.1rc1 版本起,使用 V1 引擎时,vLLM Kunlun" +#~ " 默认将在图模式下运行模型,以保持与 vLLM 同样的行为。如果遇到任何问题,欢迎在 GitHub" +#~ " 上提交 issue,并在初始化模型时通过设置 `enforce_eager=True` " +#~ "临时切换回 eager 模式。" + +#~ msgid "There are two kinds for graph mode supported by vLLM Kunlun:" +#~ msgstr "vLLM Kunlun 支持两种图模式:" + +#~ msgid "" +#~ "**ACLGraph**: This is the default graph" +#~ " mode supported by vLLM Kunlun. In" +#~ " v0.9.1rc1, only Qwen series models " +#~ "are well tested." +#~ msgstr "" +#~ "**ACLGraph**:这是 vLLM Kunlun 支持的默认图模式。在 " +#~ "v0.9.1rc1 版本中,只有 Qwen 系列模型得到了充分测试。" + +#~ msgid "" +#~ "**TorchAirGraph**: This is the GE graph" +#~ " mode. In v0.9.1rc1, only DeepSeek " +#~ "series models are supported." +#~ msgstr "**TorchAirGraph**:这是GE图模式。在v0.9.1rc1版本中,仅支持DeepSeek系列模型。" + +#~ msgid "Using ACLGraph" +#~ msgstr "使用 ACLGraph" + +#~ msgid "" +#~ "ACLGraph is enabled by default. Take " +#~ "Qwen series models as an example, " +#~ "just set to use V1 Engine is " +#~ "enough." 
+#~ msgstr "ACLGraph 默认启用。以 Qwen 系列模型为例,只需设置为使用 V1 引擎即可。" + +#~ msgid "offline example:" +#~ msgstr "离线示例:" + +#~ msgid "online example:" +#~ msgstr "在线示例:" + +#~ msgid "Using TorchAirGraph" +#~ msgstr "使用 TorchAirGraph" + +#~ msgid "" +#~ "If you want to run DeepSeek series" +#~ " models with graph mode, you should" +#~ " use " +#~ "[TorchAirGraph](https://www.hikunlun.com/document/detail/zh/Pytorch/700/modthirdparty/torchairuseguide/torchair_0002.html)." +#~ " In this case, additional config is" +#~ " required." +#~ msgstr "" +#~ "如果你想通过图模式运行 DeepSeek 系列模型,你应该使用 " +#~ "[TorchAirGraph](https://www.hikunlun.com/document/detail/zh/Pytorch/700/modthirdparty/torchairuseguide/torchair_0002.html)。在这种情况下,需要额外的配置。" + +#~ msgid "" +#~ "You can find more detail about " +#~ "additional config " +#~ "[here](../configuration/additional_config.md)." +#~ msgstr "你可以在[这里](../configuration/additional_config.md)找到关于附加配置的更多详细信息。" + +#~ msgid "Fallback to Eager Mode" +#~ msgstr "回退到 Eager 模式" + +#~ msgid "" +#~ "If both `ACLGraph` and `TorchAirGraph` " +#~ "fail to run, you should fallback " +#~ "to eager mode." +#~ msgstr "如果 `ACLGraph` 和 `TorchAirGraph` 都无法运行,你应该退回到 eager 模式。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po new file mode 100644 index 0000000..445c23b --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po @@ -0,0 +1,32 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 19:12+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/index.md:1 +#: ../../source/user_guide/feature_guide/index.md:5 +msgid "Feature Guide" +msgstr "功能指南" + +#: ../../source/user_guide/feature_guide/index.md:3 +#, fuzzy +msgid "This section provides a detailed usage guide of vLLM Kunlun features." +msgstr "本节提供了 vLLM Kunlun 功能的详细使用指南。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/kv_pool_mooncake.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/kv_pool_mooncake.po new file mode 100644 index 0000000..cd61f95 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/kv_pool_mooncake.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/kv_pool_mooncake.md:1 +msgid "Mooncacke Store Deployment Guide" +msgstr "" + +#: ../../source/user_guide/feature_guide/kv_pool_mooncake.md:3 +msgid "Environmental Dependencies" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po new file mode 100644 index 0000000..f1a9337 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po @@ -0,0 +1,68 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/lora.md:1 +msgid "LoRA Adapters Guide" +msgstr "LoRA 适配器指南" + +#: ../../source/user_guide/feature_guide/lora.md:3 +msgid "Overview" +msgstr "" + +#~ msgid "" +#~ "Like vLLM, vllm-kunlun supports LoRA " +#~ "as well. 
The usage and more " +#~ "details can be found in [vLLM " +#~ "official " +#~ "document](https://docs.vllm.ai/en/latest/features/lora.html)." +#~ msgstr "" +#~ "与 vLLM 类似,vllm-kunlun 也支持 " +#~ "LoRA。用法及更多详情可参见 [vLLM " +#~ "官方文档](https://docs.vllm.ai/en/latest/features/lora.html)。" + +#~ msgid "" +#~ "You can also refer to " +#~ "[this](https://docs.vllm.ai/en/latest/models/supported_models.html" +#~ "#list-of-text-only-language-models) " +#~ "to find which models support LoRA " +#~ "in vLLM." +#~ msgstr "" +#~ "你也可以参考[这个链接](https://docs.vllm.ai/en/latest/models/supported_models.html" +#~ "#list-of-text-only-language-models)来查找哪些模型在" +#~ " vLLM 中支持 LoRA。" + +#~ msgid "Tips" +#~ msgstr "提示" + +#~ msgid "" +#~ "If you fail to run vllm-kunlun " +#~ "with LoRA, you may follow [this " +#~ "instruction](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html" +#~ "#fallback-to-eager-mode) to disable " +#~ "graph mode and try again." +#~ msgstr "" +#~ "如果你在使用 LoRA 运行 vllm-kunlun " +#~ "时失败,可以按照[此说明](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html" +#~ "#fallback-to-eager-mode)禁用图模式后再重试。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/netloader.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/netloader.po new file mode 100644 index 0000000..dfc2975 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/netloader.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun \n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/netloader.md:1 +msgid "Netloader Guide" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po new file mode 100644 index 0000000..2974097 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po @@ -0,0 +1,198 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/quantization.md:1 +msgid "Quantization Guide" +msgstr "量化指南" + +#~ msgid "" +#~ "Model quantization is a technique that" +#~ " reduces the size and computational " +#~ "requirements of a model by lowering " +#~ "the data precision of the weights " +#~ "and activation values in the model, " +#~ "thereby saving the memory and improving" +#~ " the inference speed." 
+#~ msgstr "模型量化是一种通过降低模型中权重和激活值的数据精度,从而减少模型大小和计算需求的技术,这样可以节省内存并提高推理速度。" + +#~ msgid "" +#~ "Since 0.9.0rc2 version, quantization feature" +#~ " is experimentally supported in vLLM " +#~ "Kunlun. Users can enable quantization " +#~ "feature by specifying `--quantization kunlun`." +#~ " Currently, only Qwen, DeepSeek series " +#~ "models are well tested. We’ll support" +#~ " more quantization algorithm and models " +#~ "in the future." +#~ msgstr "" +#~ "自 0.9.0rc2 版本起,vLLM Kunlun 实验性地支持量化特性。用户可以通过指定" +#~ " `--quantization kunlun` 启用量化功能。目前,只有 " +#~ "Qwen、DeepSeek 系列模型经过了充分测试。未来我们将支持更多的量化算法和模型。" + +#~ msgid "Install modelslim" +#~ msgstr "安装 modelslim" + +#~ msgid "" +#~ "To quantize a model, users should " +#~ "install " +#~ "[ModelSlim](https://gitcode.com/Kunlun/msit/blob/master/msmodelslim/README.md)" +#~ " which is the Kunlun compression and" +#~ " acceleration tool. It is an " +#~ "affinity-based compression tool designed " +#~ "for acceleration, using compression as " +#~ "its core technology and built upon " +#~ "the Kunlun platform." +#~ msgstr "要对模型进行量化,用户应安装[ModelSlim](https://gitcode.com/Kunlun/msit/blob/master/msmodelslim/README.md),这是 Kunlun 的压缩与加速工具。它是一种基于亲和性的压缩工具,专为加速设计,以压缩为核心技术,并基于 Kunlun 平台构建。" + +#~ msgid "" +#~ "Currently, only the specific tag " +#~ "[modelslim-" +#~ "VLLM-8.1.RC1.b020_001](https://gitcode.com/Kunlun/msit/blob" +#~ "/modelslim-VLLM-8.1.RC1.b020_001/msmodelslim/README.md) of" +#~ " modelslim works with vLLM Kunlun. " +#~ "Please do not install other version " +#~ "until modelslim master version is " +#~ "available for vLLM Kunlun in the " +#~ "future."
+#~ msgstr "" +#~ "目前,只有 modelslim 的特定标签 [modelslim-" +#~ "VLLM-8.1.RC1.b020_001](https://gitcode.com/Kunlun/msit/blob" +#~ "/modelslim-VLLM-8.1.RC1.b020_001/msmodelslim/README.md) 支持" +#~ " vLLM Kunlun。在未来 modelslim 的主版本支持 vLLM " +#~ "Kunlun 之前,请不要安装其他版本。" + +#~ msgid "Install modelslim:" +#~ msgstr "安装 modelslim:" + +#~ msgid "Quantize model" +#~ msgstr "量化模型" + +#~ msgid "" +#~ "Take [DeepSeek-V2-Lite](https://modelscope.cn/models" +#~ "/deepseek-ai/DeepSeek-V2-Lite) as an example, " +#~ "you just need to download the " +#~ "model, and then execute the convert " +#~ "command. The command is shown below. " +#~ "More info can be found in " +#~ "modelslim doc [deepseek w8a8 dynamic " +#~ "quantization docs](https://gitcode.com/Kunlun/msit/blob" +#~ "/modelslim-" +#~ "VLLM-8.1.RC1.b020_001/msmodelslim/example/DeepSeek/README.md#deepseek-v2-w8a8-dynamic%E9%87%8F%E5%8C%96)." +#~ msgstr "" +#~ "以 [DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-" +#~ "ai/DeepSeek-V2-Lite) 为例,你只需要下载模型,然后执行转换命令。命令如下所示。更多信息可参考 " +#~ "modelslim 文档 [deepseek w8a8 " +#~ "动态量化文档](https://gitcode.com/Kunlun/msit/blob/modelslim-" +#~ "VLLM-8.1.RC1.b020_001/msmodelslim/example/DeepSeek/README.md#deepseek-v2-w8a8-dynamic%E9%87%8F%E5%8C%96)。" + +#~ msgid "" +#~ "You can also download the quantized " +#~ "model that we uploaded. Please note " +#~ "that these weights should be used " +#~ "for test only. For example, " +#~ "https://www.modelscope.cn/models/vllm-kunlun/DeepSeek-V2" +#~ "-Lite-W8A8" +#~ msgstr "" +#~ "你也可以下载我们上传的量化模型。请注意,这些权重仅应用于测试。例如:https://www.modelscope.cn/models" +#~ "/vllm-kunlun/DeepSeek-V2-Lite-W8A8" + +#~ msgid "Once convert action is done, there are two important files generated." +#~ msgstr "转换操作完成后,会生成两个重要的文件。" + +#~ msgid "" +#~ "[config.json](https://www.modelscope.cn/models/vllm-" +#~ "kunlun/DeepSeek-V2-Lite-" +#~ "W8A8/file/view/master/config.json?status=1). Please make" +#~ " sure that there is no " +#~ "`quantization_config` field in it." 
+#~ msgstr "" +#~ "[config.json](https://www.modelscope.cn/models/vllm-" +#~ "kunlun/DeepSeek-V2-Lite-" +#~ "W8A8/file/view/master/config.json?status=1)。请确保其中没有 " +#~ "`quantization_config` 字段。" + +#~ msgid "" +#~ "[quant_model_description.json](https://www.modelscope.cn/models" +#~ "/vllm-kunlun/DeepSeek-V2-Lite-" +#~ "W8A8/file/view/master/quant_model_description.json?status=1). " +#~ "All the converted weights info are " +#~ "recorded in this file." +#~ msgstr "" +#~ "[quant_model_description.json](https://www.modelscope.cn/models" +#~ "/vllm-kunlun/DeepSeek-V2-Lite-" +#~ "W8A8/file/view/master/quant_model_description.json?status=1)。所有被转换的权重信息都记录在该文件中。" + +#~ msgid "Here is the full converted model files:" +#~ msgstr "以下是完整转换后的模型文件:" + +#~ msgid "Run the model" +#~ msgstr "运行模型" + +#~ msgid "" +#~ "Now, you can run the quantized " +#~ "models with vLLM Kunlun. Here is " +#~ "the example for online and offline " +#~ "inference." +#~ msgstr "现在,你可以使用 vLLM Kunlun 运行量化模型。下面是在线和离线推理的示例。" + +#~ msgid "Offline inference" +#~ msgstr "离线推理" + +#~ msgid "Online inference" +#~ msgstr "在线推理" + +#~ msgid "FAQs" +#~ msgstr "常见问题解答" + +#~ msgid "" +#~ "1. How to solve the KeyError: " +#~ "'xxx.layers.0.self_attn.q_proj.weight' problem?" +#~ msgstr "1. 如何解决 KeyError: 'xxx.layers.0.self_attn.q_proj.weight' 问题?" + +#~ msgid "" +#~ "First, make sure you specify `kunlun`" +#~ " quantization method. Second, check if " +#~ "your model is converted by this " +#~ "`modelslim-VLLM-8.1.RC1.b020_001` modelslim version." +#~ " Finally, if it still doesn't work," +#~ " please submit a issue, maybe some" +#~ " new models need to be adapted." +#~ msgstr "" +#~ "首先,请确保你指定了 `kunlun` 量化方法。其次,检查你的模型是否由 `modelslim-" +#~ "VLLM-8.1.RC1.b020_001` 这个 modelslim " +#~ "版本转换。如果仍然无法使用,请提交一个 issue,可能有一些新模型需要适配。" + +#~ msgid "" +#~ "2. How to solve the error \"Could" +#~ " not locate the configuration_deepseek.py\"?" +#~ msgstr "2. 如何解决“无法找到 configuration_deepseek.py”错误?" 
+ +#~ msgid "" +#~ "Please convert DeepSeek series models " +#~ "using `modelslim-VLLM-8.1.RC1.b020_001` modelslim," +#~ " this version has fixed the missing" +#~ " configuration_deepseek.py error." +#~ msgstr "" +#~ "请使用 `modelslim-VLLM-8.1.RC1.b020_001` 的 " +#~ "modelslim 转换 DeepSeek 系列模型,该版本已修复缺少 " +#~ "configuration_deepseek.py 的错误。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po new file mode 100644 index 0000000..504dca0 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po @@ -0,0 +1,165 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/sleep_mode.md:1 +msgid "Sleep Mode Guide" +msgstr "睡眠模式指南" + +#: ../../source/user_guide/feature_guide/sleep_mode.md:3 +msgid "Overview" +msgstr "概述" + +#~ msgid "" +#~ "Sleep Mode is an API designed to" +#~ " offload model weights and discard KV" +#~ " cache from XPU memory. This " +#~ "functionality is essential for reinforcement" +#~ " learning (RL) post-training workloads, " +#~ "particularly in online algorithms such " +#~ "as PPO, GRPO, or DPO. During " +#~ "training, the policy model typically " +#~ "performs auto-regressive generation using " +#~ "inference engines like vLLM, followed by" +#~ " forward and backward passes for " +#~ "optimization." 
+#~ msgstr "" +#~ "Sleep Mode 是一个用于卸载模型权重并清除 XPU 内存中 KV " +#~ "缓存的 API。此功能对于强化学习(RL)后训练任务尤其重要,特别是在 PPO、GRPO 或 " +#~ "DPO 等在线算法中。在训练过程中,策略模型通常会使用像 vLLM " +#~ "这样的推理引擎进行自回归生成,然后进行前向和反向传播以进行优化。" + +#~ msgid "" +#~ "Since the generation and training phases" +#~ " may employ different model parallelism " +#~ "strategies, it becomes crucial to free" +#~ " KV cache and even offload model " +#~ "parameters stored within vLLM during " +#~ "training. This ensures efficient memory " +#~ "utilization and avoids resource contention " +#~ "on the XPU." +#~ msgstr "" +#~ "由于生成和训练阶段可能采用不同的模型并行策略,因此在训练过程中及时释放 KV 缓存,甚至卸载存储在 " +#~ "vLLM 内的模型参数变得至关重要。这可以确保内存的高效利用,并避免 XPU 上的资源争用。" + +#~ msgid "Getting started" +#~ msgstr "快速上手" + +#~ msgid "" +#~ "With `enable_sleep_mode=True`, the way we " +#~ "manage memory(malloc, free) in vllm will" +#~ " under a specific memory pool, during" +#~ " loading model and initialize kv_caches," +#~ " we tag the memory as a map:" +#~ " `{\"weight\": data, \"kv_cache\": data}`." +#~ msgstr "" +#~ "当 `enable_sleep_mode=True` 时,我们在 vllm " +#~ "中管理内存(malloc, free)的方式会在一个特定的内存池下进行,在加载模型和初始化 kv_caches" +#~ " 期间,我们会将内存打上标签,组织成一个映射:`{\"weight\": data, " +#~ "\"kv_cache\": data}`。" + +#~ msgid "" +#~ "The engine(v0/v1) supports two sleep " +#~ "levels to manage memory during idle " +#~ "periods:" +#~ msgstr "该引擎(v0/v1)支持两种睡眠等级,以在空闲期间管理内存:" + +#~ msgid "Level 1 Sleep" +#~ msgstr "一级睡眠" + +#~ msgid "Action: Offloads model weights and discards the KV cache." +#~ msgstr "操作:卸载模型权重并清除KV缓存。" + +#~ msgid "Memory: Model weights are moved to CPU memory; KV cache is forgotten." +#~ msgstr "内存:模型权重被移动到CPU内存;KV缓存被清除。" + +#~ msgid "Use Case: Suitable when reusing the same model later." +#~ msgstr "用例:适用于之后需要重复使用同一个模型的情况。" + +#~ msgid "" +#~ "Note: Ensure sufficient CPU memory is" +#~ " available to hold the model weights." +#~ msgstr "注意:请确保有足够的CPU内存来存储模型权重。" + +#~ msgid "Level 2 Sleep" +#~ msgstr "二级睡眠" + +#~ msgid "Action: Discards both model weights and KV cache." 
+#~ msgstr "操作:同时丢弃模型权重和KV缓存。" + +#~ msgid "" +#~ "Memory: The content of both the " +#~ "model weights and kv cache is " +#~ "forgotten." +#~ msgstr "内存:模型权重和kv缓存的内容都会被遗忘。" + +#~ msgid "" +#~ "Use Case: Ideal when switching to " +#~ "a different model or updating the " +#~ "current one." +#~ msgstr "用例:当切换到不同的模型或更新当前模型时非常理想。" + +#~ msgid "" +#~ "Since this feature uses the low-" +#~ "level API " +#~ "[KunlunCL](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html)," +#~ " in order to use sleep mode, " +#~ "you should follow the [installation " +#~ "guide](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/installation.html) and " +#~ "building from source, if you are " +#~ "using v0.7.3, remember to set `export" +#~ " COMPILE_CUSTOM_KERNELS=1`, for the latest " +#~ "version(v0.9.x+), the environment variable " +#~ "`COMPILE_CUSTOM_KERNELS` will be set 1 " +#~ "by default while building from source." +#~ msgstr "" +#~ "由于此功能使用了底层 API " +#~ "[KunlunCL](https://www.hikunlun.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html),为了使用休眠模式,你应按照[安装指南](https" +#~ "://vllm-" +#~ "kunlun.readthedocs.io/en/latest/installation.html)进行操作,并从源码编译。如果你使用的是" +#~ " v0.7.3,请记得设置 `export COMPILE_CUSTOM_KERNELS=1` " +#~ ";对于最新版本(v0.9.x+),在从源码编译时环境变量 `COMPILE_CUSTOM_KERNELS` " +#~ "默认会被设置为 1。" + +#~ msgid "Usage" +#~ msgstr "用法" + +#~ msgid "The following is a simple example of how to use sleep mode." +#~ msgstr "以下是如何使用睡眠模式的一个简单示例。" + +#~ msgid "offline inference:" +#~ msgstr "离线推理:" + +#~ msgid "online serving:" +#~ msgstr "在线服务:" + +#~ msgid "" +#~ "Considering there may be a risk of" +#~ " malicious access, please make sure " +#~ "you are under a dev-mode, and " +#~ "explicit specify the develop env: " +#~ "`VLLM_SERVER_DEV_MODE` to expose these " +#~ "endpoints(sleep/wake up)." 
+#~ msgstr "" +#~ "鉴于可能存在恶意访问的风险,请确保您处于开发模式,并明确指定开发环境:`VLLM_SERVER_DEV_MODE`,以便开放这些端点(sleep/wake" +#~ " up)。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po new file mode 100644 index 0000000..10613af --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po @@ -0,0 +1,235 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/feature_guide/structured_output.md:1 +msgid "Structured Output Guide" +msgstr "结构化输出指南" + +#: ../../source/user_guide/feature_guide/structured_output.md:3 +msgid "Overview" +msgstr "概述" + +#: ../../source/user_guide/feature_guide/structured_output.md:5 +#, fuzzy +msgid "What is structured output?" +msgstr "什么是结构化输出?" + +#~ msgid "" +#~ "LLMs can be unpredictable when you " +#~ "need output in specific formats. Think" +#~ " of asking a model to generate " +#~ "JSON - without guidance, it might " +#~ "produce valid text that breaks JSON " +#~ "specification. **Structured Output (also " +#~ "called Guided Decoding)** enables LLMs " +#~ "to generate outputs that follow a " +#~ "desired structure while preserving the " +#~ "non-deterministic nature of the system." 
+#~ msgstr "" +#~ "当你需要特定格式输出时,大型语言模型(LLMs)可能表现出不可预测性。比如让模型生成 " +#~ "JSON,如果没有指导,模型可能会生成有效的文本,但这些文本却不符合 JSON " +#~ "规范。**结构化输出(也称为引导解码)** 能让大型语言模型生成符合预期结构的输出,同时保留系统的非确定性特性。" + +#~ msgid "" +#~ "In simple terms, structured decoding " +#~ "gives LLMs a “template” to follow. " +#~ "Users provide a schema that “influences”" +#~ " the model’s output, ensuring compliance" +#~ " with the desired structure." +#~ msgstr "简单来说,结构化解码为LLM提供了一个“模板”来遵循。用户提供一个模式来“影响”模型的输出,从而确保输出符合期望的结构。" + +#~ msgid "![structured decoding](./images/structured_output_1.png)" +#~ msgstr "![结构化解码](./images/structured_output_1.png)" + +#~ msgid "structured decoding" +#~ msgstr "结构化解码" + +#~ msgid "Structured Output in vllm-kunlun" +#~ msgstr "vllm-kunlun 中的结构化输出" + +#~ msgid "" +#~ "Currently, vllm-kunlun supports **xgrammar**" +#~ " and **guidance** backend for structured" +#~ " output with vllm v1 engine." +#~ msgstr "目前,vllm-kunlun 支持 vllm v1 引擎的结构化输出,后端包括 **xgrammar** 和 **guidance**。" + +#~ msgid "" +#~ "XGrammar introduces a new technique that" +#~ " batch constrained decoding via pushdown" +#~ " automaton (PDA). You can think of" +#~ " a PDA as a “collection of " +#~ "FSMs, and each FSM represents a " +#~ "context-free grammar (CFG).” One " +#~ "significant advantage of PDA is its " +#~ "recursive nature, allowing us to execute" +#~ " multiple state transitions. They also " +#~ "include additional optimisation (for those " +#~ "who are interested) to reduce grammar" +#~ " compilation overhead. Besides, you can " +#~ "also find more details about guidance" +#~ " by yourself." +#~ msgstr "" +#~ "XGrammar 引入了一种通过下推自动机(PDA)进行批量约束解码的新技术。你可以把 PDA " +#~ "理解为“有限状态机(FSM)的集合,每个 FSM 代表一个上下文无关文法(CFG)。” PDA " +#~ "的一个重要优点是其递归特性,使我们能够执行多次状态转移。此外,PDA " +#~ "还包含了额外的优化(供感兴趣的用户参考),以减少语法编译的开销。除此之外,你还可以自行查阅更多关于 guidance 的信息。" + +#~ msgid "How to Use Structured Output?" +#~ msgstr "如何使用结构化输出?"
+ +#~ msgid "Online Inference" +#~ msgstr "在线推理" + +#~ msgid "" +#~ "You can also generate structured outputs" +#~ " using the OpenAI's Completions and " +#~ "Chat API. The following parameters are" +#~ " supported, which must be added as" +#~ " extra parameters:" +#~ msgstr "你也可以使用 OpenAI 的 Completions 和 Chat API 生成结构化输出。支持以下参数,这些参数必须作为额外参数添加:" + +#~ msgid "`guided_choice`: the output will be exactly one of the choices." +#~ msgstr "`guided_choice`:输出将会是其中一个选项。" + +#~ msgid "`guided_regex`: the output will follow the regex pattern." +#~ msgstr "`guided_regex`:输出将遵循正则表达式模式。" + +#~ msgid "`guided_json`: the output will follow the JSON schema." +#~ msgstr "`guided_json`:输出将遵循 JSON 架构。" + +#~ msgid "`guided_grammar`: the output will follow the context free grammar." +#~ msgstr "`guided_grammar`:输出将遵循上下文无关文法。" + +#~ msgid "" +#~ "Structured outputs are supported by " +#~ "default in the OpenAI-Compatible Server." +#~ " You can choose to specify the " +#~ "backend to use by setting the " +#~ "`--guided-decoding-backend` flag to vllm" +#~ " serve. The default backend is " +#~ "`auto`, which will try to choose " +#~ "an appropriate backend based on the " +#~ "details of the request. You may " +#~ "also choose a specific backend, along" +#~ " with some options." +#~ msgstr "" +#~ "OpenAI 兼容服务器默认支持结构化输出。你可以通过设置 `--guided-decoding-" +#~ "backend` 标志为 vllm serve 来指定要使用的后端。默认后端为 " +#~ "`auto`,它会根据请求的详细信息尝试选择合适的后端。你也可以选择特定的后端,并设置一些选项。" + +#~ msgid "" +#~ "Now let´s see an example for each" +#~ " of the cases, starting with the " +#~ "guided_choice, as it´s the easiest one:" +#~ msgstr "现在让我们来看每种情况的示例,首先是 guided_choice,因为它是最简单的:" + +#~ msgid "" +#~ "The next example shows how to use" +#~ " the guided_regex. 
The idea is to " +#~ "generate an email address, given a " +#~ "simple regex template:" +#~ msgstr "下一个例子展示了如何使用 guided_regex。其思路是基于一个简单的正则表达式模板生成一个电子邮件地址:" + +#~ msgid "" +#~ "One of the most relevant features " +#~ "in structured text generation is the " +#~ "option to generate a valid JSON " +#~ "with pre-defined fields and formats. " +#~ "For this we can use the " +#~ "guided_json parameter in two different " +#~ "ways:" +#~ msgstr "" +#~ "在结构化文本生成中,最相关的特性之一是能够生成具有预定义字段和格式的有效 JSON。为此,我们可以通过两种不同的方式使用 " +#~ "guided_json 参数:" + +#~ msgid "Using a JSON Schema." +#~ msgstr "使用 JSON 架构。" + +#~ msgid "Defining a Pydantic model and then extracting the JSON Schema from it." +#~ msgstr "定义一个 Pydantic 模型,然后从中提取 JSON Schema。" + +#~ msgid "" +#~ "The next example shows how to use" +#~ " the guided_json parameter with a " +#~ "Pydantic model:" +#~ msgstr "下一个示例展示了如何将 guided_json 参数与 Pydantic 模型一起使用:" + +#~ msgid "" +#~ "Finally we have the guided_grammar " +#~ "option, which is probably the most " +#~ "difficult to use, but it´s really " +#~ "powerful. It allows us to define " +#~ "complete languages like SQL queries. It" +#~ " works by using a context free " +#~ "EBNF grammar. As an example, we " +#~ "can use to define a specific " +#~ "format of simplified SQL queries:" +#~ msgstr "" +#~ "最后,我们有 guided_grammar 选项,这可能是最难使用的,但它非常强大。它允许我们定义完整的语言,比如" +#~ " SQL 查询。它通过使用上下文无关的 EBNF 语法来实现。例如,我们可以用它来定义一种简化" +#~ " SQL 查询的特定格式:" + +#~ msgid "" +#~ "Find more examples [here](https://github.com" +#~ "/vllm-" +#~ "project/vllm/blob/main/examples/offline_inference/structured_outputs.py)." +#~ msgstr "" +#~ "在[这里](https://github.com/vllm-" +#~ "project/vllm/blob/main/examples/offline_inference/structured_outputs.py)可以找到更多示例。" + +#~ msgid "Offline Inference" +#~ msgstr "离线推理" + +#~ msgid "" +#~ "To use Structured Output, we'll need " +#~ "to configure the guided decoding using" +#~ " the class `GuidedDecodingParams` inside " +#~ "`SamplingParams`. 
The main available options" +#~ " inside `GuidedDecodingParams` are:" +#~ msgstr "" +#~ "要使用结构化输出,我们需要在 `SamplingParams` 内通过 " +#~ "`GuidedDecodingParams` 类配置引导解码。`GuidedDecodingParams` " +#~ "中主要可用的选项有:" + +#~ msgid "json" +#~ msgstr "json" + +#~ msgid "regex" +#~ msgstr "正则表达式" + +#~ msgid "choice" +#~ msgstr "选择" + +#~ msgid "grammar" +#~ msgstr "语法" + +#~ msgid "One example for the usage of the choice parameter is shown below:" +#~ msgstr "choice 参数用法的一个示例如下:" + +#~ msgid "" +#~ "Find more examples of other usages " +#~ "[here](https://github.com/vllm-" +#~ "project/vllm/blob/main/examples/offline_inference/structured_outputs.py)." +#~ msgstr "" +#~ "查看更多其他用法的示例 [在这里](https://github.com/vllm-" +#~ "project/vllm/blob/main/examples/offline_inference/structured_outputs.py)。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po new file mode 100644 index 0000000..65819da --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po @@ -0,0 +1,1880 @@ +# Chinese translations for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: 2025-07-18 10:11+0800\n" +"Last-Translator: \n" +"Language: zh\n" +"Language-Team: \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/release_notes.md:1 +#, fuzzy +msgid "Release Notes" +msgstr "版本说明" + +#~ msgid "v0.9.2rc1 - 2025.07.11" +#~ msgstr "" + +#~ msgid "" +#~ "This is the 1st release candidate " +#~ "of v0.9.2 for vLLM Kunlun. 
Please " +#~ "follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/) to get started. " +#~ "From this release, V1 engine will " +#~ "be enabled by default, there is no" +#~ " need to set `VLLM_USE_V1=1` any " +#~ "more. And this release is the last" +#~ " version to support V0 engine, V0 " +#~ "code will be clean up in the " +#~ "future." +#~ msgstr "" +#~ "这是 vLLM Kunlun v0.9.2 " +#~ "的第一个候选发布版本。请参阅[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/)开始使用。从本次发布起,V1 引擎将默认启用,不再需要设置 " +#~ "`VLLM_USE_V1=1`。此外,该版本也是最后一个支持 V0 引擎的版本,V0 " +#~ "相关代码将在未来被清理。" + +#~ msgid "Highlights" +#~ msgstr "亮点" + +#~ msgid "" +#~ "Pooling model works with V1 engine " +#~ "now. You can take a try with " +#~ "Qwen3 embedding model [#1359](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1359)." +#~ msgstr "" +#~ "Pooling 模型现在可以与 V1 引擎一起使用。你可以尝试使用 Qwen3 " +#~ "embedding 模型 [#1359](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1359)。" + +#~ msgid "" +#~ "The performance on Atlas 300I series " +#~ "has been improved. [#1591](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1591)" +#~ msgstr "" +#~ "Atlas 300I 系列的性能已经提升。 [#1591](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1591)" + +#~ msgid "" +#~ "aclgraph mode works with Moe models " +#~ "now. Currently, only Qwen3 Moe is " +#~ "well tested. [#1381](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1381)" +#~ msgstr "" +#~ "aclgraph 模式现在可以与 Moe 模型一起使用。目前,仅对 Qwen3 " +#~ "Moe 进行了充分测试。[#1381](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1381)" + +#~ msgid "Core" +#~ msgstr "核心" + +#~ msgid "" +#~ "Kunlun PyTorch adapter (torch_npu) has " +#~ "been upgraded to `2.5.1.post1.dev20250619`. " +#~ "Don’t forget to update it in your" +#~ " environment. 
[#1347](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1347)" +#~ msgstr "" +#~ "Kunlun PyTorch 适配器(torch_npu)已升级到 " +#~ "`2.5.1.post1.dev20250619`。请不要忘记在您的环境中进行更新。 " +#~ "[#1347](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1347)" + +#~ msgid "" +#~ "The **GatherV3** error has been fixed" +#~ " with **aclgraph** mode. " +#~ "[#1416](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1416)" +#~ msgstr "" +#~ "**GatherV3** 错误已通过 **aclgraph** " +#~ "模式修复。[#1416](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1416)" + +#~ msgid "" +#~ "W8A8 quantization works on Atlas 300I" +#~ " series now. [#1560](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1560)" +#~ msgstr "" +#~ "W8A8 量化现在可以在 Atlas 300I " +#~ "系列上运行了。[#1560](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1560)" + +#~ msgid "" +#~ "Fix the accuracy problem with deploy " +#~ "models with parallel parameters. " +#~ "[#1678](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1678)" +#~ msgstr "" +#~ "修复了使用并行参数部署模型时的准确性问题。[#1678](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1678)" + +#~ msgid "" +#~ "The pre-built wheel package now " +#~ "requires lower version of glibc. Users" +#~ " can use it by `pip install " +#~ "vllm-kunlun` directly. [#1582](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1582)" +#~ msgstr "" +#~ "预编译的 wheel 包现在要求更低版本的 glibc。用户可以直接通过 `pip " +#~ "install vllm-kunlun` 使用它。[#1582](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1582)" + +#~ msgid "Other" +#~ msgstr "其它" + +#~ msgid "" +#~ "Official doc has been updated for " +#~ "better read experience. For example, " +#~ "more deployment tutorials are added, " +#~ "user/developer docs are updated. More " +#~ "guide will coming soon." +#~ msgstr "官方文档已更新,以提升阅读体验。例如,增加了更多部署教程,用户/开发者文档已更新。更多指南即将推出。" + +#~ msgid "" +#~ "Fix accuracy problem for deepseek V3/R1" +#~ " models with torchair graph in long" +#~ " sequence predictions. 
[#1331](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1331)" +#~ msgstr "" +#~ "修复 deepseek V3/R1 模型在使用 torchair " +#~ "图进行长序列预测时的精度问题。[#1331](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1331)" + +#~ msgid "" +#~ "A new env variable " +#~ "`VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` has been " +#~ "added. It enables the fused " +#~ "allgather-experts kernel for Deepseek V3/R1" +#~ " models. The default value is `0`." +#~ " [#1335](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1335)" +#~ msgstr "" +#~ "新增了一个环境变量 `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP`。它用于启用 " +#~ "Deepseek V3/R1 模型的 fused allgather-" +#~ "experts 内核。默认值为 `0`。[#1335](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1335)" + +#~ msgid "" +#~ "A new env variable " +#~ "`VLLM_KUNLUN_ENABLE_TOPK_TOPP_OPTIMIZATION` has been " +#~ "added to improve the performance of " +#~ "topk-topp sampling. The default value" +#~ " is 0, we'll consider to enable " +#~ "it by default in the " +#~ "future[#1732](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1732)" +#~ msgstr "" +#~ "新增了一个环境变量 `VLLM_KUNLUN_ENABLE_TOPK_TOPP_OPTIMIZATION`,用于提升 " +#~ "topk-topp 采样的性能。该变量默认值为 " +#~ "0,未来我们会考虑默认启用此选项[#1732](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1732)。" + +#~ msgid "" +#~ "A batch of bugs have been fixed" +#~ " for Data Parallelism case " +#~ "[#1273](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1273) [#1322](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1322) [#1275](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1275) [#1478](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1478)" +#~ msgstr "" +#~ "已修复了一批与数据并行相关的 bug [#1273](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1273) [#1322](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1322) " +#~ "[#1275](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1275) [#1478](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1478)" + +#~ msgid "" +#~ "The 
DeepSeek performance has been " +#~ "improved. [#1194](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1194) [#1395](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1395) [#1380](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1380)" +#~ msgstr "" +#~ "DeepSeek 的性能已得到提升。[#1194](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1194) [#1395](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1395) [#1380](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1380)" + +#~ msgid "" +#~ "Kunlun scheduler works with prefix cache" +#~ " now. [#1446](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1446)" +#~ msgstr "" +#~ "Kunlun 调度器现在支持前缀缓存。[#1446](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1446)" + +#~ msgid "" +#~ "DeepSeek now works with prefix cache " +#~ "now. [#1498](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1498)" +#~ msgstr "" +#~ "DeepSeek 现在支持前缀缓存了。[#1498](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1498)" + +#~ msgid "" +#~ "Support prompt logprobs to recover ceval" +#~ " accuracy in V1 [#1483](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1483)" +#~ msgstr "" +#~ "支持使用 prompt logprobs 恢复 V1 的 ceval" +#~ " 准确率 [#1483](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1483)" + +#~ msgid "v0.9.1rc1 - 2025.06.22" +#~ msgstr "v0.9.1rc1 - 2025.06.22" + +#~ msgid "" +#~ "This is the 1st release candidate " +#~ "of v0.9.1 for vLLM Kunlun. Please " +#~ "follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/) to get started." +#~ msgstr "" +#~ "这是 vLLM Kunlun v0.9.1 " +#~ "的第一个候选发布版本。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/)开始使用。" + +#~ msgid "" +#~ "Atlas 300I series is experimental " +#~ "supported in this release. 
" +#~ "[#1333](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1333) After careful consideration, " +#~ "this feature **will NOT be included " +#~ "in v0.9.1-dev branch** taking into " +#~ "account the v0.9.1 release quality and" +#~ " the feature rapid iteration to " +#~ "improve performance on Atlas 300I " +#~ "series. We will improve this from " +#~ "0.9.2rc1 and later." +#~ msgstr "" +#~ "本版本对 Atlas 300I " +#~ "系列提供了实验性支持。[#1333](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1333) 经过慎重考虑,鉴于 v0.9.1 版本发布的质量要求以及 " +#~ "Atlas 300I 系列性能优化的快速迭代,该功能**不会被包含在 v0.9.1-dev " +#~ "分支中**。我们将在 0.9.2rc1 及之后的版本中进一步完善该功能。" + +#~ msgid "" +#~ "Support EAGLE-3 for speculative decoding. " +#~ "[#1032](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1032)" +#~ msgstr "" +#~ "支持 EAGLE-3 进行推测式解码。[#1032](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1032)" + +#~ msgid "" +#~ "Kunlun PyTorch adapter (torch_npu) has " +#~ "been upgraded to `2.5.1.post1.dev20250528`. " +#~ "Don’t forget to update it in your" +#~ " environment. [#1235](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1235)" +#~ msgstr "" +#~ "Kunlun PyTorch 适配器(torch_npu)已升级到 " +#~ "`2.5.1.post1.dev20250528`。请不要忘记在您的环境中进行更新。[#1235](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1235)" + +#~ msgid "" +#~ "Support Atlas 300I series container " +#~ "image. You can get it from " +#~ "[quay.io](https://quay.io/repository/vllm/vllm-kunlun)" +#~ msgstr "" +#~ "支持Atlas " +#~ "300I系列的容器镜像。你可以从[quay.io](https://quay.io/repository/vllm/vllm-" +#~ "kunlun)获取。" + +#~ msgid "" +#~ "Fix token-wise padding mechanism to " +#~ "make multi-card graph mode work. 
" +#~ "[#1300](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1300)" +#~ msgstr "" +#~ "修复按 token 填充机制以支持多卡图模式。 [#1300](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1300)" + +#~ msgid "" +#~ "Upgrade vllm to 0.9.1 " +#~ "[#1165]https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1165" +#~ msgstr "" +#~ "将 vllm 升级到 0.9.1 [#1165]https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1165" + +#~ msgid "Other Improvements" +#~ msgstr "其他改进" + +#~ msgid "" +#~ "Initial support Chunked Prefill for MLA." +#~ " [#1172](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1172)" +#~ msgstr "" +#~ "为MLA初步支持分块预填充。 [#1172](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1172)" + +#~ msgid "" +#~ "An example of best practices to " +#~ "run DeepSeek with ETP has been " +#~ "added. [#1101](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1101)" +#~ msgstr "" +#~ "已新增一个使用 ETP 运行 DeepSeek " +#~ "的最佳实践示例。[#1101](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1101)" + +#~ msgid "" +#~ "Performance improvements for DeepSeek using" +#~ " the TorchAir graph. [#1098](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1098), " +#~ "[#1131](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1131)" +#~ msgstr "" +#~ "通过使用 TorchAir 图对 DeepSeek " +#~ "进行了性能提升。[#1098](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1098), [#1131](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1131)" + +#~ msgid "" +#~ "Supports the speculative decoding feature " +#~ "with KunlunScheduler. [#943](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/943)" +#~ msgstr "" +#~ "支持 KunlunScheduler 的预测性解码功能。[#943](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/943)" + +#~ msgid "" +#~ "Improve `VocabParallelEmbedding` custom op " +#~ "performance. It will be enabled in " +#~ "the next release. 
[#796](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/796)" +#~ msgstr "" +#~ "提升 `VocabParallelEmbedding` " +#~ "自定义算子的性能。该优化将在下一个版本中启用。[#796](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/796)" + +#~ msgid "" +#~ "Fixed a device discovery and setup " +#~ "bug when running vLLM Kunlun on " +#~ "Ray [#884](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/884)" +#~ msgstr "" +#~ "修复了在 Ray 上运行 vLLM Kunlun 时的设备发现和设置错误 " +#~ "[#884](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/884)" + +#~ msgid "" +#~ "DeepSeek with " +#~ "[MC2](https://www.hikunlun.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/kunluncbestP/atlas_kunlunc_best_practices_10_0043.html)" +#~ " (Merged Compute and Communication) now " +#~ "works properly. [#1268](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1268)" +#~ msgstr "" +#~ "DeepSeek 现已可以与 " +#~ "[MC2](https://www.hikunlun.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/kunluncbestP/atlas_kunlunc_best_practices_10_0043.html)(计算与通信融合)正常工作。[#1268](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1268)" + +#~ msgid "" +#~ "Fixed log2phy NoneType bug with static" +#~ " EPLB feature. [#1186](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1186)" +#~ msgstr "" +#~ "修复了带有静态 EPLB 特性时 log2phy 为 NoneType " +#~ "的 bug。[#1186](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1186)" + +#~ msgid "" +#~ "Improved performance for DeepSeek with " +#~ "DBO enabled. 
[#997](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/997), [#1135](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1135)" +#~ msgstr "" +#~ "启用 DBO 后,DeepSeek 的性能得到提升。[#997](https://github.com" +#~ "/vllm-project/vllm-" +#~ "kunlun/pull/997),[#1135](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1135)" + +#~ msgid "" +#~ "Refactoring KunlunFusedMoE [#1229](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1229)" +#~ msgstr "" +#~ "重构 KunlunFusedMoE [#1229](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1229)" + +#~ msgid "" +#~ "Add initial user stories page (include" +#~ " LLaMA-Factory/TRL/verl/MindIE Turbo/GPUStack) " +#~ "[#1224](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1224)" +#~ msgstr "" +#~ "新增初始用户故事页面(包括 LLaMA-Factory/TRL/verl/MindIE " +#~ "Turbo/GPUStack)[#1224](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1224)" + +#~ msgid "" +#~ "Add unit test framework " +#~ "[#1201](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1201)" +#~ msgstr "添加单元测试框架 [#1201](https://github.com/vllm-project/vllm-kunlun/pull/1201)" + +#~ msgid "Known Issues" +#~ msgstr "已知问题" + +#~ msgid "" +#~ "In some cases, the vLLM process " +#~ "may crash with a **GatherV3** error " +#~ "when **aclgraph** is enabled. We are " +#~ "working on this issue and will fix" +#~ " it in the next release. " +#~ "[#1038](https://github.com/vllm-project/vllm-" +#~ "kunlun/issues/1038)" +#~ msgstr "" +#~ "在某些情况下,当启用 **aclgraph** 时,vLLM 进程可能会因 " +#~ "**GatherV3** " +#~ "错误而崩溃。我们正在解决此问题,并将在下一个版本中修复。[#1038](https://github.com/vllm-" +#~ "project/vllm-kunlun/issues/1038)" + +#~ msgid "" +#~ "Prefix cache feature does not work " +#~ "with the Kunlun Scheduler but without" +#~ " chunked prefill enabled. This will " +#~ "be fixed in the next release. 
" +#~ "[#1350](https://github.com/vllm-project/vllm-" +#~ "kunlun/issues/1350)" +#~ msgstr "" +#~ "前缀缓存功能在未启用分块预填充的情况下无法与 Kunlun " +#~ "调度器一同工作。此问题将在下一个版本中修复。[#1350](https://github.com/vllm-project" +#~ "/vllm-kunlun/issues/1350)" + +#~ msgid "Full Changelog" +#~ msgstr "完整更新日志" + +#~ msgid "" +#~ "https://github.com/vllm-project/vllm-" +#~ "kunlun/compare/v0.9.0rc2...v0.9.1rc1" +#~ msgstr "" +#~ "https://github.com/vllm-project/vllm-" +#~ "kunlun/compare/v0.9.0rc2...v0.9.1rc1" + +#~ msgid "v0.9.0rc2 - 2025.06.10" +#~ msgstr "v0.9.0rc2 - 2025.06.10" + +#~ msgid "" +#~ "This release contains some quick fixes" +#~ " for v0.9.0rc1. Please use this " +#~ "release instead of v0.9.0rc1." +#~ msgstr "本次发布包含了一些针对 v0.9.0rc1 的快速修复。请使用本次发布版本,而不是 v0.9.0rc1。" + +#~ msgid "" +#~ "Fix the import error when vllm-" +#~ "kunlun is installed without editable " +#~ "way. [#1152](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1152)" +#~ msgstr "" +#~ "修复当以非可编辑方式安装 vllm-kunlun " +#~ "时的导入错误。[#1152](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1152)" + +#~ msgid "v0.9.0rc1 - 2025.06.09" +#~ msgstr "v0.9.0rc1 - 2025.06.09" + +#~ msgid "" +#~ "This is the 1st release candidate " +#~ "of v0.9.0 for vllm-kunlun. Please " +#~ "follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/) to start the " +#~ "journey. From this release, V1 Engine" +#~ " is recommended to use. The code " +#~ "of V0 Engine is frozen and will" +#~ " not be maintained any more. Please" +#~ " set environment `VLLM_USE_V1=1` to enable" +#~ " V1 Engine." +#~ msgstr "" +#~ "这是 vllm-kunlun v0.9.0 " +#~ "的第一个候选发布版本。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/)开始使用。从此版本起,推荐使用 V1 引擎。V0 " +#~ "引擎的代码已被冻结,不再维护。如需启用 V1 引擎,请设置环境变量 `VLLM_USE_V1=1`。" + +#~ msgid "" +#~ "DeepSeek works with graph mode now. " +#~ "Follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html)" +#~ " to take a try. 
[#789](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/789)" +#~ msgstr "" +#~ "DeepSeek 现在已支持图模式。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html)进行尝试。[#789](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/789)" + +#~ msgid "" +#~ "Qwen series models works with graph " +#~ "mode now. It works by default with" +#~ " V1 Engine. Please note that in " +#~ "this release, only Qwen series models" +#~ " are well tested with graph mode. " +#~ "We'll make it stable and generalize " +#~ "in the next release. If you hit" +#~ " any issues, please feel free to " +#~ "open an issue on GitHub and " +#~ "fallback to eager mode temporarily by" +#~ " set `enforce_eager=True` when initializing " +#~ "the model." +#~ msgstr "" +#~ "Qwen 系列模型现在支持图模式。默认情况下,它在 V1 引擎下运行。请注意,本次发布中,仅 " +#~ "Qwen " +#~ "系列模型经过了充分的图模式测试。我们将在下一个版本中进一步提升其稳定性并推广至更广泛的场景。如果你遇到任何问题,请随时在 " +#~ "GitHub 上提交 issue,并在初始化模型时通过设置 `enforce_eager=True`" +#~ " 临时切换回 eager 模式。" + +#~ msgid "" +#~ "The performance of multi-step scheduler" +#~ " has been improved. Thanks for the" +#~ " contribution from China Merchants Bank." +#~ " [#814](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/814)" +#~ msgstr "" +#~ "多步调度器的性能得到了提升。感谢招商银行的贡献。[#814](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/814)" + +#~ msgid "" +#~ "LoRA、Multi-LoRA And Dynamic Serving is" +#~ " supported for V1 Engine now. Thanks" +#~ " for the contribution from China " +#~ "Merchants Bank. 
[#893](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/893)" +#~ msgstr "" +#~ "V1 引擎现在支持 LoRA、多 LoRA " +#~ "以及动态服务。感谢招商银行的贡献。[#893](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/893)" + +#~ msgid "" +#~ "Prefix cache and chunked prefill feature" +#~ " works now [#782](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/782) [#844](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/844)" +#~ msgstr "" +#~ "前缀缓存和分块预填充功能现已可用 [#782](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/782) [#844](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/844)" + +#~ msgid "" +#~ "Spec decode and MTP features work " +#~ "with V1 Engine now. [#874](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/874) " +#~ "[#890](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/890)" +#~ msgstr "" +#~ "Spec 解码和 MTP 功能现在已经支持 V1 " +#~ "引擎。[#874](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/874) [#890](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/890)" + +#~ msgid "" +#~ "DP feature works with DeepSeek now. " +#~ "[#1012](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/1012)" +#~ msgstr "" +#~ "DP 功能现在可以与 DeepSeek 一起使用。[#1012](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/1012)" + +#~ msgid "" +#~ "Input embedding feature works with V0" +#~ " Engine now. [#916](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/916)" +#~ msgstr "" +#~ "输入嵌入特性现在已支持 V0 引擎。[#916](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/916)" + +#~ msgid "" +#~ "Sleep mode feature works with V1 " +#~ "Engine now. [#1084](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1084)" +#~ msgstr "" +#~ "休眠模式功能现在已支持 V1 引擎。[#1084](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1084)" + +#~ msgid "Model" +#~ msgstr "模型" + +#~ msgid "" +#~ "Qwen2.5 VL works with V1 Engine " +#~ "now. 
[#736](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/736)" +#~ msgstr "" +#~ "Qwen2.5 VL 现在可以与 V1 " +#~ "引擎协同工作。[#736](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/736)" + +#~ msgid "" +#~ "LLama4 works now. [#740](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/740)" +#~ msgstr "" +#~ "LLama4 现在可以使用了。[#740](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/740)" + +#~ msgid "" +#~ "A new kind of DeepSeek model " +#~ "called dual-batch overlap(DBO) is added." +#~ " Please set `VLLM_KUNLUN_ENABLE_DBO=1` to " +#~ "use it. [#941](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/941)" +#~ msgstr "" +#~ "新增了一种名为双批次重叠(dual-batch overlap,DBO)的 DeepSeek " +#~ "模型。请设置 `VLLM_KUNLUN_ENABLE_DBO=1` 以启用。 " +#~ "[#941](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/941)" + +#~ msgid "" +#~ "online serve with kunlun quantization " +#~ "works now. [#877](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/877)" +#~ msgstr "" +#~ "在线服务现已支持Kunlun量化。[#877](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/877)" + +#~ msgid "" +#~ "A batch of bugs for graph mode " +#~ "and moe model have been fixed. 
" +#~ "[#773](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/773) [#771](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/771) [#774](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/774) [#816](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/816) " +#~ "[#817](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/817) [#819](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/819) [#912](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/912) [#897](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/897) " +#~ "[#961](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/961) [#958](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/958) [#913](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/913) [#905](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/905)" +#~ msgstr "" +#~ "已修复一批关于图模式和moe模型的bug。[#773](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/773) [#771](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/771) [#774](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/774) " +#~ "[#816](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/816) [#817](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/817) [#819](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/819) [#912](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/912) " +#~ "[#897](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/897) [#961](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/961) [#958](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/958) [#913](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/913) " +#~ "[#905](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/905)" + +#~ msgid "" +#~ "A batch of performance improvement PRs" +#~ " have been merged. 
[#784](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/784) " +#~ "[#803](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/803) [#966](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/966) [#839](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/839) [#970](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/970) " +#~ "[#947](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/947) [#987](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/987) [#1085](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/1085)" +#~ msgstr "" +#~ "一批性能改进的 PR 已被合并。[#784](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/784) [#803](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/803) " +#~ "[#966](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/966) [#839](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/839) [#970](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/970) [#947](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/947) " +#~ "[#987](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/987) [#1085](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/1085)" + +#~ msgid "" +#~ "From this release, binary wheel package" +#~ " will be released as well. " +#~ "[#775](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/775)" +#~ msgstr "" +#~ "从本版本开始,将同时发布二进制 wheel 包。[#775](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/775)" + +#~ msgid "" +#~ "The contributor doc site is " +#~ "[added](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/community/contributors.html)" +#~ msgstr "" +#~ "贡献者文档站点已[添加](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/community/contributors.html)" + +#~ msgid "Known Issue" +#~ msgstr "已知问题" + +#~ msgid "" +#~ "In some case, vLLM process may be" +#~ " crashed with aclgraph enabled. We're " +#~ "working this issue and it'll be " +#~ "fixed in the next release." 
+#~ msgstr "在某些情况下,启用 aclgraph 时 vLLM 进程可能会崩溃。我们正在处理这个问题,并将在下一个版本中修复。" + +#~ msgid "" +#~ "Multi node data-parallel doesn't work" +#~ " with this release. This is a " +#~ "known issue in vllm and has been" +#~ " fixed on main branch. " +#~ "[#18981](https://github.com/vllm-project/vllm/pull/18981)" +#~ msgstr "" +#~ "多节点数据并行在此版本中无法使用。这是 vllm 中已知的问题,并已在主分支中修复。 " +#~ "[#18981](https://github.com/vllm-project/vllm/pull/18981)" + +#~ msgid "v0.7.3.post1 - 2025.05.29" +#~ msgstr "v0.7.3.post1 - 2025.05.29" + +#~ msgid "" +#~ "This is the first post release of" +#~ " 0.7.3. Please follow the [official " +#~ "doc](https://vllm-kunlun.readthedocs.io/en/v0.7.3-dev) to" +#~ " start the journey. It includes the" +#~ " following changes:" +#~ msgstr "" +#~ "这是 0.7.3 的第一个补丁发布。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev)开始使用。本次更新包括以下更改:" + +#~ msgid "" +#~ "Qwen3 and Qwen3MOE is supported now. " +#~ "The performance and accuracy of Qwen3" +#~ " is well tested. You can try it" +#~ " now. Mindie Turbo is recomanded to" +#~ " improve the performance of Qwen3. " +#~ "[#903](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/903) [#915](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/915)" +#~ msgstr "" +#~ "现在已支持 Qwen3 和 Qwen3MOE。Qwen3 " +#~ "的性能和精度已经过充分测试,你可以立即试用。推荐使用 Mindie Turbo 以提升 " +#~ "Qwen3 的性能。[#903](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/903) [#915](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/915)" + +#~ msgid "" +#~ "Added a new performance guide. The " +#~ "guide aims to help users to " +#~ "improve vllm-kunlun performance on " +#~ "system level. It includes OS " +#~ "configuration, library optimization, deploy " +#~ "guide and so on. 
[#878](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/878) [Doc " +#~ "Link](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html)" +#~ msgstr "" +#~ "新增了一个性能指南。该指南旨在帮助用户在系统层面提升 vllm-kunlun " +#~ "的性能。内容包括操作系统配置、库优化、部署指南等。 [#878](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/878) [文档链接](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html)" + +#~ msgid "Bug Fix" +#~ msgstr "漏洞修复" + +#~ msgid "" +#~ "Qwen2.5-VL works for RLHF scenarios " +#~ "now. [#928](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/928)" +#~ msgstr "" +#~ "Qwen2.5-VL 现在已支持 RLHF 场景。[#928](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/928)" + +#~ msgid "" +#~ "Users can launch the model from " +#~ "online weights now. e.g. from " +#~ "huggingface or modelscope directly " +#~ "[#858](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/858) [#918](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/918)" +#~ msgstr "" +#~ "用户现在可以直接从在线权重启动模型。例如,可以直接从 huggingface 或 modelscope" +#~ " 获取。[#858](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/858) [#918](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/918)" + +#~ msgid "" +#~ "The meaningless log info `UserWorkspaceSize0`" +#~ " has been cleaned. [#911](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/911)" +#~ msgstr "" +#~ "无意义的日志信息 `UserWorkspaceSize0` " +#~ "已被清理。[#911](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/911)" + +#~ msgid "" +#~ "The log level for `Failed to " +#~ "import vllm_kunlun_C` has been changed " +#~ "to `warning` instead of `error`. 
" +#~ "[#956](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/956)" +#~ msgstr "" +#~ "`Failed to import vllm_kunlun_C` 的日志级别已从 " +#~ "`error` 更改为 `warning`。[#956](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/956)" + +#~ msgid "" +#~ "DeepSeek MLA now works with chunked " +#~ "prefill in V1 Engine. Please note " +#~ "that V1 engine in 0.7.3 is just" +#~ " expermential and only for test " +#~ "usage. [#849](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/849) [#936](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/936)" +#~ msgstr "" +#~ "DeepSeek MLA 现已在 V1 引擎中支持分块预填充。请注意,0.7.3 " +#~ "版本中的 V1 引擎仅为实验性,仅供测试使用。[#849](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/849) [#936](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/936)" + +#~ msgid "Docs" +#~ msgstr "文档" + +#~ msgid "" +#~ "The benchmark doc is updated for " +#~ "Qwen2.5 and Qwen2.5-VL [#792](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/792)" +#~ msgstr "" +#~ "基准文档已针对 Qwen2.5 和 Qwen2.5-VL 更新 " +#~ "[#792](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/792)" + +#~ msgid "" +#~ "Add the note to clear that only" +#~ " \"modelscope<1.23.0\" works with 0.7.3. " +#~ "[#954](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/954)" +#~ msgstr "" +#~ "添加说明,明确只有 \"modelscope<1.23.0\" 能与 0.7.3 " +#~ "一起使用。[#954](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/954)" + +#~ msgid "v0.7.3 - 2025.05.08" +#~ msgstr "v0.7.3 - 2025.05.08" + +#~ msgid "🎉 Hello, World!" +#~ msgstr "🎉 你好,世界!" + +#~ msgid "" +#~ "We are excited to announce the " +#~ "release of 0.7.3 for vllm-kunlun. " +#~ "This is the first official release. " +#~ "The functionality, performance, and stability" +#~ " of this release are fully tested " +#~ "and verified. We encourage you to " +#~ "try it out and provide feedback. " +#~ "We'll post bug fix versions in the" +#~ " future if needed. 
Please follow the" +#~ " [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev) to start the" +#~ " journey." +#~ msgstr "" +#~ "我们很高兴地宣布 vllm-kunlun 0.7.3 " +#~ "版本正式发布。这是首个正式发布的版本。该版本的功能、性能和稳定性已充分测试和验证。我们鼓励您试用并反馈意见。如有需要,未来我们将发布修复版本。请参阅[官方文档](https" +#~ "://vllm-kunlun.readthedocs.io/en/v0.7.3-dev)开启您的体验之旅。" + +#~ msgid "" +#~ "This release includes all features " +#~ "landed in the previous release " +#~ "candidates ([v0.7.1rc1](https://github.com/vllm-project" +#~ "/vllm-kunlun/releases/tag/v0.7.1rc1), " +#~ "[v0.7.3rc1](https://github.com/vllm-project/vllm-" +#~ "kunlun/releases/tag/v0.7.3rc1), [v0.7.3rc2](https://github.com" +#~ "/vllm-project/vllm-kunlun/releases/tag/v0.7.3rc2)). And" +#~ " all the features are fully tested" +#~ " and verified. Visit the official doc" +#~ " the get the detail [feature](https" +#~ "://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html)" +#~ " and [model](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/user_guide/supported_models.html)" +#~ " support matrix." +#~ msgstr "" +#~ "本次发布包含了所有在之前候选版本中加入的功能([v0.7.1rc1](https://github.com/vllm-" +#~ "project/vllm-" +#~ "kunlun/releases/tag/v0.7.1rc1)、[v0.7.3rc1](https://github.com/vllm-" +#~ "project/vllm-" +#~ "kunlun/releases/tag/v0.7.3rc1)、[v0.7.3rc2](https://github.com/vllm-" +#~ "project/vllm-" +#~ "kunlun/releases/tag/v0.7.3rc2))。所有功能都经过了全面测试和验证。请访问官方文档获取详细的[功能](https" +#~ "://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html)和[模型](https" +#~ "://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/user_guide/supported_models.html)支持矩阵。" + +#~ msgid "" +#~ "Upgrade CANN to 8.1.RC1 to enable " +#~ "chunked prefill and automatic prefix " +#~ "caching features. You can now enable " +#~ "them now." +#~ msgstr "将 CANN 升级到 8.1.RC1 以启用分块预填充和自动前缀缓存功能。您现在可以启用这些功能了。" + +#~ msgid "" +#~ "Upgrade PyTorch to 2.5.1. vLLM Kunlun" +#~ " no longer relies on the dev " +#~ "version of torch-xpu now. 
Now " +#~ "users don't need to install the " +#~ "torch-xpu by hand. The 2.5.1 version" +#~ " of torch-xpu will be installed " +#~ "automatically. [#662](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/662)" +#~ msgstr "" +#~ "升级 PyTorch 至 2.5.1。vLLM Kunlun 现在不再依赖于" +#~ " torch-xpu 的开发版本。用户现在无需手动安装 torch-xpu,2.5.1" +#~ " 版本的 torch-xpu 会被自动安装。[#662](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/662)" + +#~ msgid "" +#~ "Integrate MindIE Turbo into vLLM Kunlun" +#~ " to improve DeepSeek V3/R1, Qwen 2" +#~ " series performance. [#708](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/708)" +#~ msgstr "" +#~ "将 MindIE Turbo 集成到 vLLM Kunlun 以提升" +#~ " DeepSeek V3/R1、Qwen 2 " +#~ "系列的性能。[#708](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/708)" + +#~ msgid "" +#~ "LoRA、Multi-LoRA And Dynamic Serving is" +#~ " supported now. The performance will " +#~ "be improved in the next release. " +#~ "Please follow the official doc for " +#~ "more usage information. Thanks for the" +#~ " contribution from China Merchants Bank." +#~ " [#700](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/700)" +#~ msgstr "" +#~ "现在已经支持 LoRA、多LoRA " +#~ "和动态服务。下一个版本中性能将会提升。请参阅官方文档以获取更多用法信息。感谢招商银行的贡献。[#700](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/700)" + +#~ msgid "" +#~ "The performance of Qwen2 vl and " +#~ "Qwen2.5 vl is improved. " +#~ "[#702](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/702)" +#~ msgstr "" +#~ "Qwen2 vl 和 Qwen2.5 vl 的性能得到了提升。 " +#~ "[#702](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/702)" + +#~ msgid "" +#~ "The performance of `apply_penalties` and " +#~ "`topKtopP` ops are improved. " +#~ "[#525](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/525)" +#~ msgstr "" +#~ "`apply_penalties` 和 `topKtopP` 操作的性能得到了提升。 " +#~ "[#525](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/525)" + +#~ msgid "" +#~ "Fixed a issue that may lead CPU" +#~ " memory leak. 
[#691](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/691) [#712](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/712)" +#~ msgstr "" +#~ "修复了可能导致CPU内存泄漏的问题。 [#691](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/691) [#712](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/712)" + +#~ msgid "" +#~ "A new environment `SOC_VERSION` is " +#~ "added. If you hit any soc " +#~ "detection error when building with " +#~ "custom ops enabled, please set " +#~ "`SOC_VERSION` to a suitable value. " +#~ "[#606](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/606)" +#~ msgstr "" +#~ "新增了一个环境变量 `SOC_VERSION`。如果在启用自定义算子时构建过程中遇到 soc " +#~ "检测错误,请将 `SOC_VERSION` 设置为合适的值。[#606](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/606)" + +#~ msgid "" +#~ "openEuler container image supported with " +#~ "v0.7.3-openeuler tag. [#665](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/665)" +#~ msgstr "" +#~ "openEuler 容器镜像已支持 v0.7.3-openeuler " +#~ "标签。[#665](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/665)" + +#~ msgid "" +#~ "Prefix cache feature works on V1 " +#~ "engine now. [#559](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/559)" +#~ msgstr "" +#~ "前缀缓存功能现在已在 V1 引擎上工作。[#559](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/559)" + +#~ msgid "v0.8.5rc1 - 2025.05.06" +#~ msgstr "v0.8.5rc1 - 2025.05.06" + +#~ msgid "" +#~ "This is the 1st release candidate " +#~ "of v0.8.5 for vllm-kunlun. Please " +#~ "follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/) to start the " +#~ "journey. Now you can enable V1 " +#~ "egnine by setting the environment " +#~ "variable `VLLM_USE_V1=1`, see the feature " +#~ "support status of vLLM Kunlun in " +#~ "[here](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)." 
+#~ msgstr "" +#~ "这是 vllm-kunlun v0.8.5 " +#~ "的第一个候选发布版本。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/)开始使用。现在,您可以通过设置环境变量 `VLLM_USE_V1=1`" +#~ " 启用 V1 引擎。关于 vLLM Kunlun " +#~ "的特性支持情况,请参见[这里](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)。" + +#~ msgid "" +#~ "Upgrade CANN version to 8.1.RC1 to " +#~ "support chunked prefill and automatic " +#~ "prefix caching (`--enable_prefix_caching`) when " +#~ "V1 is enabled [#747](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/747)" +#~ msgstr "" +#~ "将 CANN 版本升级到 8.1.RC1,以支持在启用 V1 " +#~ "时的分块预填充和自动前缀缓存(`--enable_prefix_caching`)[#747](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/747)" + +#~ msgid "" +#~ "Optimize Qwen2 VL and Qwen 2.5 VL" +#~ " [#701](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/701)" +#~ msgstr "" +#~ "优化 Qwen2 VL 和 Qwen 2.5 VL " +#~ "[#701](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/701)" + +#~ msgid "" +#~ "Improve Deepseek V3 eager mode and " +#~ "graph mode performance, now you can " +#~ "use --additional_config={'enable_graph_mode': True} " +#~ "to enable graph mode. 
" +#~ "[#598](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/598) [#719](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/719)" +#~ msgstr "" +#~ "改进了 Deepseek V3 的 eager 模式和图模式性能,现在你可以使用" +#~ " --additional_config={'enable_graph_mode': True} " +#~ "来启用图模式。[#598](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/598) [#719](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/719)" + +#~ msgid "" +#~ "Upgrade vLLM to 0.8.5.post1 " +#~ "[#715](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/715)" +#~ msgstr "" +#~ "将 vLLM 升级到 0.8.5.post1 " +#~ "[#715](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/715)" + +#~ msgid "" +#~ "Fix early return in " +#~ "CustomDeepseekV2MoE.forward during profile_run " +#~ "[#682](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/682)" +#~ msgstr "" +#~ "修复在 profile_run 期间 CustomDeepseekV2MoE.forward " +#~ "过早返回的问题 [#682](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/682)" + +#~ msgid "" +#~ "Adapts for new quant model generated " +#~ "by modelslim [#719](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/719)" +#~ msgstr "" +#~ "适配由 modelslim 生成的新量化模型 [#719](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/719)" + +#~ msgid "" +#~ "Initial support on P2P Disaggregated " +#~ "Prefill based on llm_datadist " +#~ "[#694](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/694)" +#~ msgstr "" +#~ "基于 llm_datadist 的 P2P 分布式 Prefill " +#~ "初步支持 [#694](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/694)" + +#~ msgid "" +#~ "Use `/vllm-workspace` as code path " +#~ "and include `.git` in container image" +#~ " to fix issue when start vllm " +#~ "under `/workspace` [#726](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/726)" +#~ msgstr "" +#~ "使用 `/vllm-workspace` 作为代码路径,并在容器镜像中包含 `.git`" +#~ " ,以修复在 `/workspace` 下启动 vllm 时的问题 " +#~ "[#726](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/726)" + +#~ msgid "" +#~ "Optimize XPU memory usage to make " 
+#~ "DeepSeek R1 W8A8 32K model len " +#~ "work. [#728](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/728)" +#~ msgstr "" +#~ "优化XPU内存使用,以使 DeepSeek R1 W8A8 32K " +#~ "模型长度能够运行。[#728](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/728)" + +#~ msgid "" +#~ "Fix `PYTHON_INCLUDE_PATH` typo in setup.py " +#~ "[#762](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/762)" +#~ msgstr "" +#~ "修复 setup.py 中的 `PYTHON_INCLUDE_PATH` 拼写错误 " +#~ "[#762](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/762)" + +#~ msgid "" +#~ "Add Qwen3-0.6B test [#717](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/717)" +#~ msgstr "" +#~ "添加 Qwen3-0.6B 测试 [#717](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/717)" + +#~ msgid "" +#~ "Add nightly CI [#668](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/668)" +#~ msgstr "添加每晚持续集成 [#668](https://github.com/vllm-project/vllm-kunlun/pull/668)" + +#~ msgid "" +#~ "Add accuracy test report " +#~ "[#542](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/542)" +#~ msgstr "添加准确性测试报告 [#542](https://github.com/vllm-project/vllm-kunlun/pull/542)" + +#~ msgid "v0.8.4rc2 - 2025.04.29" +#~ msgstr "v0.8.4rc2 - 2025.04.29" + +#~ msgid "" +#~ "This is the second release candidate " +#~ "of v0.8.4 for vllm-kunlun. Please " +#~ "follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/) to start the " +#~ "journey. Some experimental features are " +#~ "included in this version, such as " +#~ "W8A8 quantization and EP/DP support. " +#~ "We'll make them stable enough in " +#~ "the next release." +#~ msgstr "" +#~ "这是 vllm-kunlun 的 v0.8.4 " +#~ "第二个候选版本。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/)开始使用。本版本包含了一些实验性功能,如 W8A8 量化和" +#~ " EP/DP 支持。我们将在下一个版本中使这些功能更加稳定。" + +#~ msgid "" +#~ "Qwen3 and Qwen3MOE is supported now. " +#~ "Please follow the [official doc](https" +#~ "://vllm-kunlun.readthedocs.io/en/latest/tutorials/single_npu.html)" +#~ " to run the quick demo. 
" +#~ "[#709](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/709)" +#~ msgstr "" +#~ "现在已支持 Qwen3 和 Qwen3MOE。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/tutorials/single_npu.html)运行快速演示。[#709](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/709)" + +#~ msgid "" +#~ "Kunlun W8A8 quantization method is " +#~ "supported now. Please take the [official" +#~ " doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/tutorials/multi_npu_quantization.html)" +#~ " for example. Any [feedback](https://github.com" +#~ "/vllm-project/vllm-kunlun/issues/619) is welcome." +#~ " [#580](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/580)" +#~ msgstr "" +#~ "现在支持 Kunlun W8A8 量化方法。请参考[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/tutorials/multi_npu_quantization.html)" +#~ " 示例。欢迎提供任何[反馈](https://github.com/vllm-project/vllm-" +#~ "kunlun/issues/619)。[#580](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/580)" + +#~ msgid "" +#~ "DeepSeek V3/R1 works with DP, TP " +#~ "and MTP now. Please note that it's" +#~ " still in experimental status. Let us" +#~ " know if you hit any problem. 
" +#~ "[#429](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/429) [#585](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/585) [#626](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/626) [#636](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/636) " +#~ "[#671](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/671)" +#~ msgstr "" +#~ "DeepSeek V3/R1 现在已经支持 DP、TP 和 " +#~ "MTP。请注意,目前仍处于实验阶段。如果遇到任何问题,请告知我们。 [#429](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/429) " +#~ "[#585](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/585) [#626](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/626) [#636](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/636) [#671](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/671)" + +#~ msgid "" +#~ "ACLGraph feature is supported with V1" +#~ " engine now. It's disabled by default" +#~ " because this feature rely on CANN" +#~ " 8.1 release. We'll make it available" +#~ " by default in the next release " +#~ "[#426](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/426)" +#~ msgstr "" +#~ "ACLGraph 特性现在已被 V1 引擎支持。它默认是禁用的,因为该特性依赖于 CANN" +#~ " 8.1 版本。我们将在下一个版本中默认启用此特性 [#426](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/426)。" + +#~ msgid "" +#~ "Upgrade PyTorch to 2.5.1. vLLM Kunlun" +#~ " no longer relies on the dev " +#~ "version of torch-xpu now. Now " +#~ "users don't need to install the " +#~ "torch-xpu by hand. The 2.5.1 version" +#~ " of torch-xpu will be installed " +#~ "automatically. [#661](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/661)" +#~ msgstr "" +#~ "升级 PyTorch 至 2.5.1。vLLM Kunlun 现在不再依赖" +#~ " dev 版本的 torch-xpu,用户无需手动安装 torch-xpu" +#~ "。torch-xpu 的 2.5.1 " +#~ "版本将会自动安装。[#661](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/661)" + +#~ msgid "" +#~ "MiniCPM model works now. 
" +#~ "[#645](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/645)" +#~ msgstr "" +#~ "MiniCPM 模型现在可以使用了。[#645](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/645)" + +#~ msgid "" +#~ "openEuler container image supported with " +#~ "`v0.8.4-openeuler` tag and customs Ops " +#~ "build is enabled by default for " +#~ "openEuler OS. [#689](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/689)" +#~ msgstr "" +#~ "openEuler 容器镜像已支持 `v0.8.4-openeuler` 标签,并且 " +#~ "openEuler 操作系统默认启用了自定义 Ops " +#~ "构建。[#689](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/689)" + +#~ msgid "" +#~ "Fix ModuleNotFoundError bug to make Lora" +#~ " work [#600](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/600)" +#~ msgstr "" +#~ "修复 ModuleNotFoundError 错误以使 Lora 正常工作 " +#~ "[#600](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/600)" + +#~ msgid "" +#~ "Add \"Using EvalScope evaluation\" doc " +#~ "[#611](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/611)" +#~ msgstr "" +#~ "添加了“使用 EvalScope 评估”文档 [#611](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/611)" + +#~ msgid "" +#~ "Add a `VLLM_VERSION` environment to make" +#~ " vLLM version configurable to help " +#~ "developer set correct vLLM version if" +#~ " the code of vLLM is changed by" +#~ " hand locally. [#651](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/651)" +#~ msgstr "" +#~ "新增了一个 `VLLM_VERSION` 环境变量,使 vLLM " +#~ "版本可以配置,帮助开发者在本地手动修改 vLLM 代码后,设置正确的 vLLM " +#~ "版本。[#651](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/651)" + +#~ msgid "v0.8.4rc1 - 2025.04.18" +#~ msgstr "v0.8.4rc1 - 2025.04.18" + +#~ msgid "" +#~ "This is the first release candidate " +#~ "of v0.8.4 for vllm-kunlun. Please " +#~ "follow the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/) to start the " +#~ "journey. From this version, vllm-kunlun" +#~ " will follow the newest version of" +#~ " vllm and release every two weeks." 
+#~ " For example, if vllm releases v0.8.5" +#~ " in the next two weeks, vllm-" +#~ "kunlun will release v0.8.5rc1 instead of" +#~ " v0.8.4rc2. Please find the detail " +#~ "from the [official documentation](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/community/versioning_policy.html" +#~ "#release-window)." +#~ msgstr "" +#~ "这是 vllm-kunlun v0.8.4 " +#~ "的第一个候选发布版本。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/)开始使用。本版本起,vllm-kunlun 将跟随 " +#~ "vllm 的最新版本并每两周发布一次。例如,如果 vllm 在接下来的两周内发布 " +#~ "v0.8.5,vllm-kunlun 将发布 v0.8.5rc1,而不是 " +#~ "v0.8.4rc2。详细信息请参考[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/community/versioning_policy.html" +#~ "#release-window)。" + +#~ msgid "" +#~ "vLLM V1 engine experimental support is" +#~ " included in this version. You can" +#~ " visit [official " +#~ "guide](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html)" +#~ " to get more detail. By default, " +#~ "vLLM will fallback to V0 if V1 " +#~ "doesn't work, please set `VLLM_USE_V1=1` " +#~ "environment if you want to use V1" +#~ " forcely." +#~ msgstr "" +#~ "本版本包含了对 vLLM V1 " +#~ "引擎的实验性支持。你可以访问[官方指南](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html)获取更多详细信息。默认情况下,如果" +#~ " V1 不可用,vLLM 会自动回退到 V0。如果你想强制使用 V1,请设置 " +#~ "`VLLM_USE_V1=1` 环境变量。" + +#~ msgid "" +#~ "LoRA、Multi-LoRA And Dynamic Serving is" +#~ " supported now. The performance will " +#~ "be improved in the next release. " +#~ "Please follow the [official " +#~ "doc](https://docs.vllm.ai/en/latest/features/lora.html) for " +#~ "more usage information. Thanks for the" +#~ " contribution from China Merchants Bank." +#~ " [#521](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/521)." +#~ msgstr "" +#~ "现在已支持 LoRA、Multi-LoRA " +#~ "和动态服务。性能将在下一个版本中得到提升。请参阅[官方文档](https://docs.vllm.ai/en/latest/features/lora.html)获取更多使用信息。感谢招商银行的贡献。[#521](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/521)。" + +#~ msgid "" +#~ "Sleep Mode feature is supported. 
" +#~ "Currently it's only work on V0 " +#~ "engine. V1 engine support will come " +#~ "soon. [#513](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/513)" +#~ msgstr "" +#~ "已支持休眠模式功能。目前它只在V0引擎上有效,V1引擎的支持即将到来。[#513](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/513)" + +#~ msgid "" +#~ "The Kunlun scheduler is added for " +#~ "V1 engine. This scheduler is more " +#~ "affinity with Kunlun hardware. More " +#~ "scheduler policy will be added in " +#~ "the future. [#543](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/543)" +#~ msgstr "" +#~ "为V1引擎新增了Kunlun调度器。该调度器与Kunlun硬件更加适配。未来还将添加更多调度策略。 " +#~ "[#543](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/543)" + +#~ msgid "" +#~ "Disaggregated Prefill feature is supported." +#~ " Currently only 1P1D works. NPND is" +#~ " under design by vllm team. vllm-" +#~ "kunlun will support it once it's " +#~ "ready from vLLM. Follow the [official" +#~ " " +#~ "guide](https://docs.vllm.ai/en/latest/features/disagg_prefill.html)" +#~ " to use. [#432](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/432)" +#~ msgstr "" +#~ "支持分离式预填充(Disaggregated " +#~ "Prefill)功能。目前仅支持1P1D,NPND正在由vllm团队设计中。一旦vLLM支持,vllm-" +#~ "kunlun将会支持。请按照[官方指南](https://docs.vllm.ai/en/latest/features/disagg_prefill.html)使用。[#432](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/432)" + +#~ msgid "" +#~ "Spec decode feature works now. Currently" +#~ " it's only work on V0 engine. " +#~ "V1 engine support will come soon. " +#~ "[#500](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/500)" +#~ msgstr "" +#~ "Spec 解码功能现在可以使用。目前它只在 V0 引擎上工作,对 V1 " +#~ "引擎的支持即将到来。[#500](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/500)" + +#~ msgid "" +#~ "Structured output feature works now on" +#~ " V1 Engine. Currently it only " +#~ "supports xgrammar backend while using " +#~ "guidance backend may get some errors." 
+#~ " [#555](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/555)" +#~ msgstr "" +#~ "结构化输出功能现在已在V1引擎上生效。目前仅支持xgrammar后端,使用guidance后端可能会出现一些错误。[#555](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/555)" + +#~ msgid "" +#~ "A new communicator `pyhccl` is added." +#~ " It's used for call CANN HCCL " +#~ "library directly instead of using " +#~ "`torch.distribute`. More usage of it " +#~ "will be added in the next release" +#~ " [#503](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/503)" +#~ msgstr "" +#~ "新增了一个通信器 `pyhccl`。它用于直接调用 CANN HCCL 库,而不是使用" +#~ " `torch.distribute`。将在下一个版本中添加更多用法 [#503](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/503)。" + +#~ msgid "" +#~ "The custom ops build is enabled by" +#~ " default. You should install the " +#~ "packages like `gcc`, `cmake` first to" +#~ " build `vllm-kunlun` from source. Set" +#~ " `COMPILE_CUSTOM_KERNELS=0` environment to " +#~ "disable the compilation if you don't " +#~ "need it. [#466](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/466)" +#~ msgstr "" +#~ "自定义算子的构建默认是启用的。你应该先安装如 `gcc`、`cmake` 等包以便从源码编译 " +#~ "`vllm-kunlun`。如果不需要自定义算子的编译,可以设置环境变量 " +#~ "`COMPILE_CUSTOM_KERNELS=0` 来禁用编译。 " +#~ "[#466](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/466)" + +#~ msgid "" +#~ "The custom op `rotay embedding` is " +#~ "enabled by default now to improve " +#~ "the performance. [#555](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/555)" +#~ msgstr "" +#~ "自定义算子 `rotay embedding` " +#~ "现在已默认启用,以提升性能。[#555](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/555)" + +#~ msgid "v0.7.3rc2 - 2025.03.29" +#~ msgstr "v0.7.3rc2 - 2025.03.29" + +#~ msgid "" +#~ "This is 2nd release candidate of " +#~ "v0.7.3 for vllm-kunlun. Please follow" +#~ " the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev) to start the" +#~ " journey." 
+#~ msgstr "" +#~ "这是 vllm-kunlun v0.7.3 " +#~ "的第二个候选发布版本。请根据[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev)开始使用。" + +#~ msgid "" +#~ "Quickstart with container: https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/quick_start.html" +#~ msgstr "" +#~ "容器快速入门: https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/quick_start.html" + +#~ msgid "" +#~ "Installation: https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/installation.html" +#~ msgstr "安装: https://vllm-kunlun.readthedocs.io/en/v0.7.3-dev/installation.html" + +#~ msgid "" +#~ "Add Kunlun Custom Ops framewrok. " +#~ "Developers now can write customs ops " +#~ "using KunlunC. An example ops " +#~ "`rotary_embedding` is added. More tutorials" +#~ " will come soon. The Custom Ops " +#~ "compilation is disabled by default when" +#~ " installing vllm-kunlun. Set " +#~ "`COMPILE_CUSTOM_KERNELS=1` to enable it. " +#~ "[#371](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/371)" +#~ msgstr "" +#~ "新增了Kunlun自定义算子框架。开发者现在可以使用KunlunC编写自定义算子。新增了一个示例算子 " +#~ "`rotary_embedding` 。更多教程即将发布。安装vllm-" +#~ "kunlun时,自定义算子的编译默认是关闭的。可通过设置 `COMPILE_CUSTOM_KERNELS=1` " +#~ "启用。[#371](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/371)" + +#~ msgid "" +#~ "V1 engine is basic supported in " +#~ "this release. The full support will " +#~ "be done in 0.8.X release. If you" +#~ " hit any issue or have any " +#~ "requirement of V1 engine. Please tell" +#~ " us [here](https://github.com/vllm-project/vllm-" +#~ "kunlun/issues/414). [#376](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/376)" +#~ msgstr "" +#~ "本版本对 V1 引擎提供了基础支持,全面支持将在 0.8.X " +#~ "版本中完成。如果您遇到任何问题或有 V1 引擎的相关需求,请在[这里](https://github.com" +#~ "/vllm-project/vllm-" +#~ "kunlun/issues/414)告诉我们。[#376](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/376)" + +#~ msgid "" +#~ "Prefix cache feature works now. You " +#~ "can set `enable_prefix_caching=True` to enable" +#~ " it. 
[#282](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/282)" +#~ msgstr "" +#~ "前缀缓存功能现在已经可用。你可以通过设置 `enable_prefix_caching=True` " +#~ "来启用该功能。[#282](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/282)" + +#~ msgid "" +#~ "Bump torch_npu version to dev20250320.3 " +#~ "to improve accuracy to fix `!!!` " +#~ "output problem. [#406](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/406)" +#~ msgstr "" +#~ "将 torch_npu 版本升级到 dev20250320.3 以提升精度,修复 " +#~ "`!!!` 输出问题。[#406](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/406)" + +#~ msgid "" +#~ "The performance of Qwen2-vl is improved" +#~ " by optimizing patch embedding (Conv3D)." +#~ " [#398](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/398)" +#~ msgstr "" +#~ "通过优化 patch embedding(Conv3D),Qwen2-vl " +#~ "的性能得到了提升。[#398](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/398)" + +#~ msgid "" +#~ "Fixed a bug to make sure multi " +#~ "step scheduler feature work. " +#~ "[#349](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/349)" +#~ msgstr "" +#~ "修复了一个错误,以确保多步调度器功能正常工作。[#349](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/349)" + +#~ msgid "" +#~ "Fixed a bug to make prefix cache" +#~ " feature works with correct accuracy. " +#~ "[#424](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/424)" +#~ msgstr "" +#~ "修复了一个 bug,使前缀缓存功能能够以正确的准确性运行。[#424](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/424)" + +#~ msgid "v0.7.3rc1 - 2025.03.14" +#~ msgstr "v0.7.3rc1 - 2025.03.14" + +#~ msgid "" +#~ "🎉 Hello, World! This is the first" +#~ " release candidate of v0.7.3 for " +#~ "vllm-kunlun. Please follow the [official" +#~ " doc](https://vllm-kunlun.readthedocs.io/en/v0.7.3-dev) " +#~ "to start the journey." +#~ msgstr "" +#~ "🎉 你好,世界!这是 vllm-kunlun v0.7.3 " +#~ "的第一个候选发布版本。请按照[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev)开始你的旅程。" + +#~ msgid "" +#~ "DeepSeek V3/R1 works well now. 
Read " +#~ "the [official guide](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html) " +#~ "to start! [#242](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/242)" +#~ msgstr "" +#~ "DeepSeek V3/R1 现在运行良好。请阅读[官方指南](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html)开始![#242](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/242)" + +#~ msgid "" +#~ "Speculative decoding feature is supported. " +#~ "[#252](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/252)" +#~ msgstr "已支持猜测性解码功能。[#252](https://github.com/vllm-project/vllm-kunlun/pull/252)" + +#~ msgid "" +#~ "Multi step scheduler feature is " +#~ "supported. [#300](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/300)" +#~ msgstr "已支持多步调度器功能。[#300](https://github.com/vllm-project/vllm-kunlun/pull/300)" + +#~ msgid "" +#~ "Bump torch_npu version to dev20250308.3 " +#~ "to improve `_exponential` accuracy" +#~ msgstr "将 torch_npu 版本升级到 dev20250308.3,以提升 `_exponential` 的精度" + +#~ msgid "" +#~ "Added initial support for pooling " +#~ "models. Bert based model, such as " +#~ "`BAAI/bge-base-en-v1.5` and `BAAI/bge-" +#~ "reranker-v2-m3` works now. [#229](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/229)" +#~ msgstr "" +#~ "新增了对池化模型的初步支持。现在支持 Bert 基础模型,如 `BAAI/bge-" +#~ "base-en-v1.5` 和 `BAAI/bge-reranker-v2-m3`。 " +#~ "[#229](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/229)" + +#~ msgid "" +#~ "The performance of Qwen2-VL is improved." 
+#~ " [#241](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/241)" +#~ msgstr "" +#~ "Qwen2-VL 的性能得到了提升。[#241](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/241)" + +#~ msgid "" +#~ "MiniCPM is now supported " +#~ "[#164](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/164)" +#~ msgstr "" +#~ "MiniCPM 现在已被支持 [#164](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/164)" + +#~ msgid "" +#~ "Support MTP(Multi-Token Prediction) for " +#~ "DeepSeek V3/R1 [#236](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/236)" +#~ msgstr "" +#~ "为 DeepSeek V3/R1 支持 MTP(多标记预测) " +#~ "[#236](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/236)" + +#~ msgid "" +#~ "[Docs] Added more model tutorials, " +#~ "include DeepSeek, QwQ, Qwen and Qwen " +#~ "2.5VL. See the [official doc](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/tutorials/index.html) for" +#~ " detail" +#~ msgstr "" +#~ "[文档] 增加了更多的模型教程,包括 DeepSeek、QwQ、Qwen 和 Qwen" +#~ " 2.5VL。详情请参见[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.3-dev/tutorials/index.html)。" + +#~ msgid "" +#~ "Pin modelscope<1.23.0 on vLLM v0.7.3 to" +#~ " resolve: https://github.com/vllm-" +#~ "project/vllm/pull/13807" +#~ msgstr "" +#~ "在 vLLM v0.7.3 上锁定 modelscope 版本低于 " +#~ "1.23.0,以解决:https://github.com/vllm-project/vllm/pull/13807" + +#~ msgid "Known issues" +#~ msgstr "已知问题" + +#~ msgid "" +#~ "In [some cases](https://github.com/vllm-project" +#~ "/vllm-kunlun/issues/324), especially when the " +#~ "input/output is very long, the accuracy" +#~ " of output may be incorrect. We " +#~ "are working on it. It'll be fixed" +#~ " in the next release." +#~ msgstr "" +#~ "在[某些情况下](https://github.com/vllm-project/vllm-" +#~ "kunlun/issues/324),特别是当输入或输出非常长时,输出的准确性可能会有误。我们正在解决这个问题。将在下一个版本中修复。" + +#~ msgid "" +#~ "Improved and reduced the garbled code" +#~ " in model output. 
But if you " +#~ "still hit the issue, try to change" +#~ " the generation config value, such as" +#~ " `temperature`, and try again. There " +#~ "is also a knonwn issue shown " +#~ "below. Any [feedback](https://github.com/vllm-" +#~ "project/vllm-kunlun/issues/267) is welcome. " +#~ "[#277](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/277)" +#~ msgstr "" +#~ "改进并减少了模型输出中的乱码问题。但如果你仍然遇到该问题,请尝试更改生成配置的参数,例如 " +#~ "`temperature`,然后再试一次。下面还列出了一个已知问题。欢迎提供任何[反馈](https://github.com" +#~ "/vllm-project/vllm-" +#~ "kunlun/issues/267)。[#277](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/277)" + +#~ msgid "v0.7.1rc1 - 2025.02.19" +#~ msgstr "v0.7.1rc1 - 2025.02.19" + +#~ msgid "" +#~ "We are excited to announce the " +#~ "first release candidate of v0.7.1 for" +#~ " vllm-kunlun." +#~ msgstr "我们很高兴地宣布 vllm-kunlun v0.7.1 的第一个候选版本发布。" + +#~ msgid "" +#~ "vLLM Kunlun Plugin (vllm-kunlun) is " +#~ "a community maintained hardware plugin " +#~ "for running vLLM on the Kunlun " +#~ "XPU. With this release, users can " +#~ "now enjoy the latest features and " +#~ "improvements of vLLM on the Kunlun " +#~ "XPU." +#~ msgstr "" +#~ "vLLM Kunlun 插件(vllm-kunlun)是一个由社区维护的硬件插件,用于在 " +#~ "Kunlun XPU 上运行 vLLM。通过此版本,用户现在可以在 Kunlun " +#~ "XPU 上享受到 vLLM 的最新功能和改进。" + +#~ msgid "" +#~ "Please follow the [official doc](https" +#~ "://vllm-kunlun.readthedocs.io/en/v0.7.1-dev) to start" +#~ " the journey. Note that this is " +#~ "a release candidate, and there may " +#~ "be some bugs or issues. We " +#~ "appreciate your feedback and suggestions " +#~ "[here](https://github.com/vllm-project/vllm-" +#~ "kunlun/issues/19)" +#~ msgstr "" +#~ "请参阅[官方文档](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.1-dev)开始您的体验之旅。请注意,这是一个候选发布版本,可能会有一些漏洞或问题。我们非常欢迎您在[这里](https://github.com" +#~ "/vllm-project/vllm-kunlun/issues/19)提交反馈和建议。" + +#~ msgid "" +#~ "Initial supports for Kunlun XPU on " +#~ "vLLM. 
[#3](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/3)" +#~ msgstr "" +#~ "在 vLLM 上初步支持 Kunlun " +#~ "XPU。[#3](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/3)" + +#~ msgid "" +#~ "DeepSeek is now supported. " +#~ "[#88](https://github.com/vllm-project/vllm-kunlun/pull/88)" +#~ " [#68](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/68)" +#~ msgstr "" +#~ "现在已支持 DeepSeek。 [#88](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/88) [#68](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/68)" + +#~ msgid "" +#~ "Qwen, Llama series and other popular " +#~ "models are also supported, you can " +#~ "see more details in [here](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/supported_models.html)." +#~ msgstr "" +#~ "Qwen、Llama 系列及其他流行的模型也受支持,更多详情可参见[这里](https://vllm-" +#~ "kunlun.readthedocs.io/en/latest/user_guide/supported_models.html)。" + +#~ msgid "" +#~ "Added the Kunlun quantization config " +#~ "option, the implementation will coming " +#~ "soon. [#7](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/7) [#73](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/73)" +#~ msgstr "" +#~ "新增了 Kunlun 量化配置选项,具体实现即将推出。[#7](https://github.com" +#~ "/vllm-project/vllm-kunlun/pull/7) " +#~ "[#73](https://github.com/vllm-project/vllm-kunlun/pull/73)" + +#~ msgid "" +#~ "Add silu_and_mul and rope ops and " +#~ "add mix ops into attention layer. " +#~ "[#18](https://github.com/vllm-project/vllm-kunlun/pull/18)" +#~ msgstr "" +#~ "添加 silu_and_mul 和 rope 操作,并将混合操作加入到 " +#~ "attention 层。 [#18](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/18)" + +#~ msgid "" +#~ "[CI] Enable Kunlun CI to actively " +#~ "monitor and improve quality for vLLM " +#~ "on Kunlun. 
[#3](https://github.com/vllm-project" +#~ "/vllm-kunlun/pull/3)" +#~ msgstr "" +#~ "[CI] 启用 Kunlun CI,主动监测并提升 vLLM 在 " +#~ "Kunlun 上的质量。[#3](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/3)" + +#~ msgid "" +#~ "[Docker] Add vllm-kunlun container image" +#~ " [#64](https://github.com/vllm-project/vllm-" +#~ "kunlun/pull/64)" +#~ msgstr "" +#~ "[Docker] 添加 vllm-kunlun 容器镜像 " +#~ "[#64](https://github.com/vllm-project/vllm-kunlun/pull/64)" + +#~ msgid "" +#~ "[Docs] Add a [live doc](https://vllm-" +#~ "kunlun.readthedocs.org) [#55](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/55)" +#~ msgstr "" +#~ "[文档] 添加了一个 [在线文档](https://vllm-" +#~ "kunlun.readthedocs.org) [#55](https://github.com/vllm-" +#~ "project/vllm-kunlun/pull/55)" + +#~ msgid "" +#~ "This release relies on an unreleased " +#~ "torch_npu version. It has been installed" +#~ " within official container image already." +#~ " Please [install](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.1rc1/installation.html) it " +#~ "manually if you are using non-" +#~ "container environment." +#~ msgstr "" +#~ "此版本依赖于尚未发布的 torch_npu " +#~ "版本。该版本已集成在官方容器镜像中。如果您使用的是非容器环境,请[手动安装](https://vllm-" +#~ "kunlun.readthedocs.io/en/v0.7.1rc1/installation.html)。" + +#~ msgid "" +#~ "There are logs like `No platform " +#~ "detected, vLLM is running on " +#~ "UnspecifiedPlatform` or `Failed to import " +#~ "from vllm._C with ModuleNotFoundError(\"No " +#~ "module named 'vllm._C'\")` shown when " +#~ "running vllm-kunlun. It actually doesn't" +#~ " affect any functionality and performance." +#~ " You can just ignore it. And it" +#~ " has been fixed in this " +#~ "[PR](https://github.com/vllm-project/vllm/pull/12432) " +#~ "which will be included in v0.7.3 " +#~ "soon." 
+#~ msgstr "" +#~ "在运行 vllm-kunlun 时,会显示类似 `No platform " +#~ "detected, vLLM is running on " +#~ "UnspecifiedPlatform` 或 `Failed to import " +#~ "from vllm._C with ModuleNotFoundError(\"No " +#~ "module named 'vllm._C'\")` " +#~ "的日志。这实际上不会影响任何功能和性能,你可以直接忽略它。这个问题已在此 [PR](https://github.com" +#~ "/vllm-project/vllm/pull/12432) 中修复,并很快会在 v0.7.3 " +#~ "版本中包含。" + +#~ msgid "" +#~ "There are logs like `# CPU blocks:" +#~ " 35064, # CPU blocks: 2730` shown " +#~ "when running vllm-kunlun which should" +#~ " be `# XPU blocks:` . It " +#~ "actually doesn't affect any functionality " +#~ "and performance. You can just ignore " +#~ "it. And it has been fixed in " +#~ "this [PR](https://github.com/vllm-project/vllm/pull/13378)" +#~ " which will be included in v0.7.3 " +#~ "soon." +#~ msgstr "" +#~ "在运行 vllm-kunlun 时,会显示类似 `# CPU " +#~ "blocks: 35064, # CPU blocks: 2730` " +#~ "的日志,实际应该为 `# XPU " +#~ "blocks:`。这实际上不会影响任何功能和性能,你可以忽略它。该问题已在这个 [PR](https://github.com" +#~ "/vllm-project/vllm/pull/13378) 中修复,并将在 v0.7.3 " +#~ "版本中包含。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po new file mode 100644 index 0000000..1f45b37 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po @@ -0,0 +1,33 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/support_matrix/index.md:5 +msgid "Support Matrix" +msgstr "支持矩阵" + +#: ../../source/user_guide/support_matrix/index.md:1 +#, fuzzy +msgid "Features and Models" +msgstr "特性与模型" + +#: ../../source/user_guide/support_matrix/index.md:3 +#, fuzzy +msgid "This section provides a detailed matrix supported by vLLM Kunlun." +msgstr "本节提供了 vLLM Kunlun 的详细支持矩阵。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po new file mode 100644 index 0000000..035890b --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po @@ -0,0 +1,221 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/support_matrix/supported_features.md:1 +msgid "Supported Features" +msgstr "" + +#: ../../source/user_guide/support_matrix/supported_features.md:3 +msgid "The feature support principle of vLLM" +msgstr "" + +#~ msgid "Feature Support" +#~ msgstr "功能支持" + +#~ msgid "" +#~ "The feature support principle of vLLM" +#~ " Kunlun is: **aligned with the " +#~ "vLLM**. We are also actively " +#~ "collaborating with the community to " +#~ "accelerate support." +#~ msgstr "vLLM Kunlun 的特性支持原则是:**与 vLLM 保持一致**。我们也在积极与社区合作,加快支持进度。" + +#~ msgid "" +#~ "You can check the [support status " +#~ "of vLLM V1 Engine][v1_user_guide]. 
Below " +#~ "is the feature support status of " +#~ "vLLM Kunlun:" +#~ msgstr "你可以查看 [vLLM V1 引擎的支持状态][v1_user_guide]。下面是 vLLM Kunlun 的功能支持情况:" + +#~ msgid "Feature" +#~ msgstr "特性" + +#~ msgid "vLLM V0 Engine" +#~ msgstr "vLLM V0 引擎" + +#~ msgid "vLLM V1 Engine" +#~ msgstr "vLLM V1 引擎" + +#~ msgid "Next Step" +#~ msgstr "下一步" + +#~ msgid "Chunked Prefill" +#~ msgstr "分块预填充" + +#~ msgid "🟢 Functional" +#~ msgstr "🟢 功能性" + +#~ msgid "Functional, see detail note: [Chunked Prefill][cp]" +#~ msgstr "功能性,详见说明:[分块预填充][cp]" + +#~ msgid "Automatic Prefix Caching" +#~ msgstr "自动前缀缓存" + +#~ msgid "Functional, see detail note: [vllm-kunlun#732][apc]" +#~ msgstr "可用,请参见详细说明:[vllm-kunlun#732][apc]" + +#~ msgid "LoRA" +#~ msgstr "LoRA" + +#~ msgid "[vllm-kunlun#396][multilora], [vllm-kunlun#893][v1 multilora]" +#~ msgstr "[vllm-kunlun#396][multilora],[vllm-kunlun#893][v1 multilora]" + +#~ msgid "Prompt adapter" +#~ msgstr "提示适配器" + +#~ msgid "🔴 No plan" +#~ msgstr "🔴 无计划" + +#~ msgid "This feature has been deprecated by vllm." +#~ msgstr "此功能已被 vllm 弃用。" + +#~ msgid "Speculative decoding" +#~ msgstr "猜测式解码" + +#~ msgid "Basic support" +#~ msgstr "基础支持" + +#~ msgid "Pooling" +#~ msgstr "池化" + +#~ msgid "🟡 Planned" +#~ msgstr "🟡 计划中" + +#~ msgid "CI needed and adapting more models; V1 support rely on vLLM support." 
+#~ msgstr "需要持续集成(CI)并适配更多模型;V1 的支持依赖于 vLLM 的支持。" + +#~ msgid "Enc-dec" +#~ msgstr "Enc-dec(编码-解码)" + +#~ msgid "🔴 NO plan" +#~ msgstr "🔴 没有计划" + +#~ msgid "Plan in 2025.06.30" +#~ msgstr "2025.06.30 的计划" + +#~ msgid "Multi Modality" +#~ msgstr "多模态" + +#~ msgid "[Tutorial][multimodal], optimizing and adapting more models" +#~ msgstr "[教程][multimodal],优化和适配更多模型" + +#~ msgid "LogProbs" +#~ msgstr "LogProbs" + +#~ msgid "CI needed" +#~ msgstr "需要持续集成(CI)" + +#~ msgid "Prompt logProbs" +#~ msgstr "提示 logProbs" + +#~ msgid "Async output" +#~ msgstr "异步输出" + +#~ msgid "Multi step scheduler" +#~ msgstr "多步调度器" + +#~ msgid "🔴 Deprecated" +#~ msgstr "🔴 已弃用" + +#~ msgid "[vllm#8779][v1_rfc], replaced by [vLLM V1 Scheduler][v1_scheduler]" +#~ msgstr "[vllm#8779][v1_rfc],已被 [vLLM V1 调度器][v1_scheduler] 替代" + +#~ msgid "Best of" +#~ msgstr "精选" + +#~ msgid "[vllm#13361][best_of], CI needed" +#~ msgstr "[vllm#13361][best_of],需要持续集成(CI)" + +#~ msgid "Beam search" +#~ msgstr "束搜索" + +#~ msgid "Guided Decoding" +#~ msgstr "引导解码" + +#~ msgid "[vllm-kunlun#177][guided_decoding]" +#~ msgstr "[vllm-kunlun#177][guided_decoding]" + +#~ msgid "Tensor Parallel" +#~ msgstr "张量并行" + +#~ msgid "Pipeline Parallel" +#~ msgstr "流水线并行" + +#~ msgid "Expert Parallel" +#~ msgstr "专家并行" + +#~ msgid "CI needed; No plan on V0 support" +#~ msgstr "需要持续集成;没有支持V0的计划" + +#~ msgid "Data Parallel" +#~ msgstr "数据并行" + +#~ msgid "CI needed; No plan on V0 support" +#~ msgstr "需要 CI;暂无 V0 支持计划" + +#~ msgid "Prefill Decode Disaggregation" +#~ msgstr "预填充 解码 拆分" + +#~ msgid "1P1D available, working on xPyD and V1 support." 
+#~ msgstr "1P1D 已可用,正在开发 xPyD 和 V1 支持。" + +#~ msgid "Quantization" +#~ msgstr "量化" + +#~ msgid "W8A8 available, CI needed; working on more quantization method support" +#~ msgstr "W8A8 已可用,需要持续集成(CI);正在开发对更多量化方法的支持。" + +#~ msgid "Graph Mode" +#~ msgstr "图模式" + +#~ msgid "🔵 Experimental" +#~ msgstr "🔵 实验性" + +#~ msgid "Experimental, see detail note: [vllm-kunlun#767][graph_mode]" +#~ msgstr "实验性功能,详见说明:[vllm-kunlun#767][graph_mode]" + +#~ msgid "Sleep Mode" +#~ msgstr "睡眠模式" + +#~ msgid "level=1 available, CI needed, working on V1 support" +#~ msgstr "level=1 可用,需要CI,正在开发 V1 支持" + +#~ msgid "🟢 Functional: Fully operational, with ongoing optimizations." +#~ msgstr "🟢 功能性:完全可用,正在持续优化中。" + +#~ msgid "" +#~ "🔵 Experimental: Experimental support, " +#~ "interfaces and functions may change." +#~ msgstr "🔵 实验性:实验性支持,接口和功能可能会发生变化。" + +#~ msgid "🚧 WIP: Under active development, will be supported soon." +#~ msgstr "🚧 WIP:正在积极开发中,很快将会支持。" + +#~ msgid "" +#~ "🟡 Planned: Scheduled for future " +#~ "implementation (some may have open " +#~ "PRs/RFCs)." +#~ msgstr "🟡 计划中:已安排将来实现(其中一些可能已有开放的PR/RFC)。" + +#~ msgid "🔴 NO plan / Deprecated: No plan for V0 or deprecated by vLLM v1." +#~ msgstr "🔴 没有计划 / 已弃用:V0 没有计划或已被 vLLM v1 弃用。" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po new file mode 100644 index 0000000..5c65087 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po @@ -0,0 +1,168 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-kunlun team +# This file is distributed under the same license as the vllm-kunlun +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-kunlun\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-11-10 16:59+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language: zh_CN\n" +"Language-Team: zh_CN \n" +"Plural-Forms: nplurals=1; plural=0;\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../source/user_guide/support_matrix/supported_models.md:1 +#, fuzzy +msgid "Supported Models" +msgstr "支持的模型" + +#~ msgid "Model Support" +#~ msgstr "模型支持" + +#~ msgid "Text-only Language Models" +#~ msgstr "纯文本语言模型" + +#~ msgid "Generative Models" +#~ msgstr "生成模型" + +#~ msgid "Model" +#~ msgstr "模型" + +#~ msgid "Note" +#~ msgstr "注释" + +#~ msgid "DeepSeek v3" +#~ msgstr "DeepSeek v3" + +#~ msgid "✅" +#~ msgstr "✅" + +#~ msgid "DeepSeek R1" +#~ msgstr "DeepSeek R1" + +#~ msgid "DeepSeek Distill (Qwen/LLama)" +#~ msgstr "DeepSeek 精炼(Qwen/LLama)" + +#~ msgid "Qwen3" +#~ msgstr "Qwen3" + +#~ msgid "Qwen3-Moe" +#~ msgstr "Qwen3-Moe" + +#~ msgid "Qwen2.5" +#~ msgstr "Qwen2.5" + +#~ msgid "QwQ-32B" +#~ msgstr "QwQ-32B" + +#~ msgid "LLama3.1/3.2" +#~ msgstr "LLama3.1/3.2" + +#~ msgid "Internlm" +#~ msgstr "Internlm" + +#~ msgid "Baichuan" +#~ msgstr "百川" + +#~ msgid "Phi-4-mini" +#~ msgstr "Phi-4-mini" + +#~ msgid "MiniCPM" +#~ msgstr "MiniCPM" + +#~ msgid "MiniCPM3" +#~ msgstr "MiniCPM3" + +#~ msgid "LLama4" +#~ msgstr "LLama4" + +#~ msgid "Mistral" +#~ msgstr "Mistral" + +#~ msgid "Need test" +#~ msgstr "需要测试" + +#~ msgid "DeepSeek v2.5" +#~ msgstr "DeepSeek v2.5" + +#~ msgid "Gemma-2" +#~ msgstr "Gemma-2" + +#~ msgid "Mllama" +#~ msgstr "Mllama" + +#~ msgid "Gemma-3" +#~ msgstr "Gemma-3" + +#~ msgid "❌" +#~ msgstr "❌" + +#~ msgid "[#496](https://github.com/vllm-project/vllm-kunlun/issues/496)" +#~ msgstr "[#496](https://github.com/vllm-project/vllm-kunlun/issues/496)" + +#~ msgid "ChatGLM" +#~ msgstr "ChatGLM" + +#~ msgid 
"[#554](https://github.com/vllm-project/vllm-kunlun/issues/554)" +#~ msgstr "[#554](https://github.com/vllm-project/vllm-kunlun/issues/554)" + +#~ msgid "Pooling Models" +#~ msgstr "池化模型" + +#~ msgid "XLM-RoBERTa-based" +#~ msgstr "基于XLM-RoBERTa" + +#~ msgid "Molmo" +#~ msgstr "Molmo" + +#~ msgid "Multimodal Language Models" +#~ msgstr "多模态语言模型" + +#~ msgid "Qwen2-VL" +#~ msgstr "Qwen2-VL" + +#~ msgid "Qwen2.5-VL" +#~ msgstr "Qwen2.5-VL" + +#~ msgid "LLaVA 1.5" +#~ msgstr "LLaVA 1.5" + +#~ msgid "LLaVA 1.6" +#~ msgstr "LLaVA 1.6" + +#~ msgid "[#553](https://github.com/vllm-project/vllm-kunlun/issues/553)" +#~ msgstr "[#553](https://github.com/vllm-project/vllm-kunlun/issues/553)" + +#~ msgid "InternVL2" +#~ msgstr "InternVL2" + +#~ msgid "InternVL2.5" +#~ msgstr "InternVL2.5" + +#~ msgid "Qwen2-Audio" +#~ msgstr "Qwen2-Audio" + +#~ msgid "LLaVA-Next" +#~ msgstr "LLaVA-Next" + +#~ msgid "LLaVA-Next-Video" +#~ msgstr "LLaVA-Next-Video" + +#~ msgid "Phi-3-Vison/Phi-3.5-Vison" +#~ msgstr "Phi-3-Vison/Phi-3.5-Vison" + +#~ msgid "GLM-4v" +#~ msgstr "GLM-4v" + +#~ msgid "Ultravox" +#~ msgstr "Ultravox" + diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 22de6de..58a1dd6 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -1,9 +1,7 @@ # Quickstart ## Prerequisites - ### Supported Devices - - Kunlun3 P800 ## Setup environment using container @@ -22,7 +20,7 @@ if [ $XPU_NUM -gt 0 ]; then done DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl" fi -export build_image="wjie520/vllm_kunlun:v0.0.1" +export build_image="xxxxx" docker run -itd ${DOCKER_DEVICE_CONFIG} \ --net=host \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -34,12 +32,10 @@ docker run -itd ${DOCKER_DEVICE_CONFIG} \ -w /workspace \ "$build_image" /bin/bash ``` - :::: ::::: Start docker: - ```bash #start bash ./rundocker.sh @@ -48,18 +44,16 @@ docker exec -it bash ``` The default working directory is `/workspace`. 
With the fully provisioned environment image we provide, you can quickly start developing and running tasks within this directory. - ## Set up system environment - ``` -#Set environment +#Set environment chmod +x /workspace/vllm-kunlun/setup_env.sh && source /workspace/vllm-kunlun/setup_env.sh ``` - ## Usage You can start the service quickly using the script below. + :::::{tab-set} ::::{tab-item} Offline Batched Inference @@ -74,49 +68,65 @@ import os from vllm import LLM, SamplingParams def main(): - model_path = "/models/Qwen3-8B" - llm_params = { - "model": model_path, - "tensor_parallel_size": 1, - "trust_remote_code": True, - "dtype": "float16", - "enable_chunked_prefill": False, - "distributed_executor_backend": "mp", - } + model_path = "models/Qwen3-VL-30B-A3B-Instruct" - llm = LLM(**llm_params) + llm = LLM( + model=model_path, + tokenizer=model_path, + tensor_parallel_size=1, + trust_remote_code=True, + dtype="float16", + distributed_executor_backend="mp", + max_model_len=32768, + gpu_memory_utilization=0.9, + block_size=128, + max_num_seqs=128, + max_num_batched_tokens=32768, + enable_prefix_caching=False, + enable_chunked_prefill=False, + served_model_name="Qwen3-VL", + compilation_config={ + "splitting_ops": [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer", + ] + }, + ) + # === test chat === messages = [ { "role": "user", - "content": [ - { - "type": "text", - "text": "What is your name?" 
- } - ] + "content": [{"type": "text", "text": "Hello, what can you do?"}] } ] - sampling_params = SamplingParams( + sampling = SamplingParams( max_tokens=200, - temperature=1.0, + temperature=0.8, top_k=50, top_p=1.0, - stop_token_ids=[181896] ) - outputs = llm.chat(messages, sampling_params=sampling_params) + print("开始推理...") + outputs = llm.chat(messages, sampling_params=sampling) + + print("模型输出:\n") + print(outputs[0].outputs[0].text) - response = outputs[0].outputs[0].text - print("=" * 50) - print("Input content:", messages) - print("Model response:\n", response) - print("=" * 50) if __name__ == "__main__": main() + ``` :::: @@ -125,7 +135,7 @@ if __name__ == "__main__": vLLM can also be deployed as a server that implements the OpenAI API protocol. Run the following command to start the vLLM server with the -[Qwen3-8B]model: +[Qwen3-VL-30B-A3B-Instruct]model: @@ -133,7 +143,7 @@ the following command to start the vLLM server with the python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ --port 8356 \ - --model /models/Qwen3-8B\ + --model models/Qwen3-VL-30B-A3B-Instruct \ --gpu-memory-utilization 0.9 \ --trust-remote-code \ --max-model-len 32768 \ @@ -141,15 +151,21 @@ python -m vllm.entrypoints.openai.api_server \ --dtype float16 \ --max_num_seqs 128 \ --max_num_batched_tokens 32768 \ - --max-seq-len-to-capture 32768 \ --block-size 128 \ --no-enable-prefix-caching \ --no-enable-chunked-prefill \ --distributed-executor-backend mp \ - --served-model-name Qwen3-8B \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' \ + --served-model-name Qwen3-VL \ + --compilation-config '{"splitting_ops": ["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + 
"vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer"]}' \ ``` If you see a log as below: @@ -166,12 +182,14 @@ Congratulations, you have successfully started the vLLM server! You can query the model with input prompts: ```bash -curl http://localhost:8356/v1/completions \ +curl http://localhost:8356/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen3-8B", - "prompt": "What is your name?", - "max_tokens": 7, + "model": "Qwen3-VL", + "messages": [ + {"role": "user", "content": "What is your name?"} + ], + "max_tokens": 200, "temperature": 0 }' @@ -197,4 +215,4 @@ INFO: Application shutdown complete. Finally, you can exit the container by using `ctrl-D`. :::: -::::: +::::: \ No newline at end of file diff --git a/docs/source/tutorials/multi_xpu_GLM-4.5.md b/docs/source/tutorials/multi_xpu_GLM-4.5.md index da07b74..421838d 100644 --- a/docs/source/tutorials/multi_xpu_GLM-4.5.md +++ b/docs/source/tutorials/multi_xpu_GLM-4.5.md @@ -17,10 +17,9 @@ docker run -itd \ -v /usr/local/bin/:/usr/local/bin/ \ -v /lib/x86_64-linux-gnu/libxpunvidia-ml.so.1:/lib/x86_64-linux-gnu/libxpunvidia-ml.so.1 \ iregistry.baidu-int.com/hac_test/aiak-inference-llm:xpu_dev_20251113_221821 bash - + docker exec -it glm-vllm-01011 /bin/bash ``` - ### Offline Inference on multi XPU Start the server in a container: @@ -31,7 +30,7 @@ import os from vllm import LLM, SamplingParams def main(): - + model_path = "/data/GLM-4.5" llm_params = { @@ -51,7 +50,7 @@ def main(): "content": [ { "type": "text", - "text": "Hello, who are you?" + "text": "你好,请问你是谁?" 
} ] } @@ -69,8 +68,8 @@ def main(): response = outputs[0].outputs[0].text print("=" * 50) - print("Input content:", messages) - print("Model response:\n", response) + print("输入内容:", messages) + print("模型回复:\n", response) print("=" * 50) if __name__ == "__main__": @@ -84,10 +83,12 @@ If you run this script successfully, you can see the info shown below: ```bash ================================================== -Input content: [{'role': 'user', 'content': [{'type': 'text', 'text': 'Hello, who are you?'}]}] -Model response: +输入内容: [{'role': 'user', 'content': [{'type': 'text', 'text': '你好,请问你是谁?'}]}] +模型回复: -Well, the user asked a rather direct question about identity. This question seems simple, but there could be several underlying intentions—perhaps they are testing my reliability for the first time, or they simply want to confirm the identity of the conversational partner. From the common positioning of AI assistants, the user has provided a clear and flat way to define identity while leaving room for potential follow-up questions.\n\nThe user used "you" instead of "your", which leans towards a more informal tone, so the response style can be a bit more relaxed. However, since this is the initial response, it is better to maintain a moderate level of professionalism. 
Mentioning +嗯,用户问了一个相当身份的直接问题。这个问题看似简单,但背后可能 +有几种可能性意—ta或许初次测试我的可靠性,或者单纯想确认对话方。从AI助手的常见定位,用户给出清晰平的方式明确身份,同时为后续可能 +的留出生进行的空间。\n\n用户用“你”这个“您”,语气更倾向非正式交流,所以回复风格可以轻松些。不过既然是初次回复,保持适度的专业性比较好稳妥。提到 ================================================== ``` @@ -113,9 +114,8 @@ python -m vllm.entrypoints.openai.api_server \ --no-enable-chunked-prefill \ --distributed-executor-backend mp \ --served-model-name GLM-4.5 \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2"]}' > log_glm_plugin.txt 2>&1 & + --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2"]}' > log_glm_plugin.txt 2>&1 & ``` - If your service start successfully, you can see the info shown below: ```bash @@ -132,7 +132,7 @@ curl http://localhost:8989/v1/chat/completions \ -d '{ "model": "GLM-4.5", "messages": [ - {"role": "user", "content": "Hello, who are you?"} + {"role": "user", "content": "你好,请问你是谁?"} ], "max_tokens": 100, "temperature": 0.7 @@ -142,7 +142,7 @@ curl http://localhost:8989/v1/chat/completions \ If you query the server successfully, you can see the info shown below (client): ```bash -{"id":"chatcmpl-6af7318de7394bc4ae569e6324a162fa","object":"chat.completion","created":1763101638,"model":"GLM-4.5","choices":[{"index":0,"message":{"role":"assistant","content":"\nThe user asked, \"Hello, who are you?\" This is a question about my identity. First, I need to confirm the user's intent. They might be using this service for the first time or have never interacted with similar AI assistants before, so they want to know my background and capabilities.\n\nNext, I should ensure my answer is clear and friendly, focusing on key points: who I am, who developed me, and what I can do. 
I should avoid technical jargon and keep the response conversational so it's easy to understand.\n\nAdditionally, the user may have potential needs, such as wanting to know what I am capable of.","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":11,"total_tokens":111,"completion_tokens":100,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_tr +{"id":"chatcmpl-6af7318de7394bc4ae569e6324a162fa","object":"chat.completion","created":1763101638,"model":"GLM-4.5","choices":[{"index":0,"message":{"role":"assistant","content":"\n用户问“你好,请问你是谁?”,这是一个应该是个了解我的身份。首先,我需要确认用户的需求是什么。可能他们是第一次使用这个服务,或者之前没有接触过类似的AI助手,所以想确认我的背景和能力。 \n\n接下来,我要确保回答清晰明了,同时友好关键点:我是谁,由谁开发,能做什么。需要避免使用专业术语,保持口语化,让不同容易理解。 \n\n然后,用户可能有潜在的需求,比如想了解我能","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":11,"total_tokens":111,"completion_tokens":100,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_tr ``` Logs of the vllm server: @@ -150,4 +150,4 @@ Logs of the vllm server: ```bash (APIServer pid=54567) INFO: 127.0.0.1:60338 - "POST /v1/completions HTTP/1.1" 200 OK (APIServer pid=54567) INFO 11-13 14:35:48 [loggers.py:123] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 0.7 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0% -``` +``` \ No newline at end of file diff --git a/docs/source/tutorials/single_xpu_Qwen3-8B.md b/docs/source/tutorials/single_xpu_Qwen3-8B.md index 154bc08..0af7161 100644 --- a/docs/source/tutorials/single_xpu_Qwen3-8B.md +++ b/docs/source/tutorials/single_xpu_Qwen3-8B.md @@ -16,7 +16,7 @@ if [ $XPU_NUM -gt 0 ]; then 
DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl" fi -export build_image="xxxxxxxxxxxxxxxxx" +export build_image="xxxxxxxxxxxxxxxxx" docker run -itd ${DOCKER_DEVICE_CONFIG} \ --net=host \ @@ -58,7 +58,7 @@ def main(): "content": [ { "type": "text", - "text": "tell a joke" + "text": "说个笑话" } ] } @@ -76,8 +76,8 @@ def main(): response = outputs[0].outputs[0].text print("=" * 50) - print("Input content:", messages) - print("Model response:\n", response) + print("输入内容:", messages) + print("模型回复:\n", response) print("=" * 50) if __name__ == "__main__": @@ -91,18 +91,16 @@ If you run this script successfully, you can see the info shown below: ```bash ================================================== -Input content: [{'role': 'user', 'content': [{'type': 'text', 'text': 'tell a joke'}]}] -Model response: +输入内容: [{'role': 'user', 'content': [{'type': 'text', 'text': '说个笑话'}]}] +模型回复: +好的,用户让我讲个笑话。首先,我需要考虑用户的需求。他们可能只是想轻松一下,或者需要一些娱乐。接下来,我要选择一个适合的笑话,不要太复杂,容易理解,同时也要有趣味性。 -Okay, the user asked me to tell a joke. First, I need to consider the user's needs. They might just want to relax or need some entertainment. Next, I need to choose a suitable joke that is not too complicated, easy to understand, and also interesting. +用户可能希望笑话是中文的,所以我要确保笑话符合中文的语言习惯和文化背景。我需要避免涉及敏感话题,比如政治、宗教或者可能引起误解的内容。然后,我得考虑笑话的结构,通常是一个设置和一个出人意料的结尾,这样能带来笑点。 +例如,可以讲一个关于日常生活的小幽默,比如动物或者常见的场景。比如,一只乌龟和兔子赛跑的故事,但加入一些反转。不过要确保笑话的长度适中,不要太长,以免用户失去兴趣。另外,要注意用词口语化,避免生硬或复杂的句子结构。 -The user might expect the joke to be in Chinese, so I need to ensure that the joke conforms to the language habits and cultural background of Chinese. I need to avoid sensitive topics, such as politics, religion, or anything that might cause misunderstanding. Then, I have to consider the structure of the joke, which usually involves a setup and an unexpected ending to create humor. - -For example, I could tell a light-hearted story about everyday life, such as animals or common scenarios. 
For instance, the story of a turtle and a rabbit racing, but with a twist. However, I need to ensure that the joke is of moderate length and not too long, so the user doesn't lose interest. Additionally, I should pay attention to using colloquial language and avoid stiff or complex sentence structures. - -I might also need to check if this joke is common to avoid repetition. If the user has heard something similar before, I may need to come up with a different angle. +可能还要检查一下这个笑话是否常见,避免重复。如果用户之前听过类似的,可能需要 ================================================== ``` @@ -132,7 +130,6 @@ python -m vllm.entrypoints.openai.api_server \ "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2"]}' \ ``` - If your service start successfully, you can see the info shown below: ```bash @@ -165,4 +162,4 @@ Logs of the vllm server: ```bash (APIServer pid=54567) INFO: 127.0.0.1:60338 - "POST /v1/completions HTTP/1.1" 200 OK (APIServer pid=54567) INFO 11-13 14:35:48 [loggers.py:123] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 0.7 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0% -``` +``` \ No newline at end of file diff --git a/docs/source/user_guide/configuration/env_vars.md b/docs/source/user_guide/configuration/env_vars.md index b71c61d..9bb4d5e 100644 --- a/docs/source/user_guide/configuration/env_vars.md +++ b/docs/source/user_guide/configuration/env_vars.md @@ -14,4 +14,4 @@ vllm-kunlun uses the following environment variables to configure the system: | `export XMLIR_FORCE_USE_XPU_GRAPH` | `1` | ***\*Forces the enablement of XPU Graph mode.\****. This can capture and optimize the model execution graph, significantly boosting inference performance. | | `export VLLM_HOST_IP` | `$(hostname -i)` | ***\*Sets the host IP address for the vLLM service\****. This uses a shell command to dynamically get the current host's internal IP. 
It's used for inter-node communication in a distributed environment. | | `export XMLIR_ENABLE_MOCK_TORCH_COMPILE` | `false` | ***\*Disable Mock Torch Compile Function\****. Set to `false` to ensure the actual compilation and optimization flow is used, rather than mock mode. | -| `FUSED_QK_ROPE_OP` | `0` | ***\*Control whether to use the Fused QK-Norm and RoPE implementation\****. Default is `0` (use original/standard RoPE). Setting to `1` may be used to enable QWEN3. | \ No newline at end of file +| `USE_ORI_ROPE` | `1` | ***\*Control whether to use the original RoPE (Rotary Position Embedding) implementation\****. Default is `1` (use original/standard RoPE). Setting to `0` enables the fused operator for Qwen3, which requires specific model support. | \ No newline at end of file diff --git a/docs/source/user_guide/feature_guide/graph_mode.md b/docs/source/user_guide/feature_guide/graph_mode.md index 463d703..ea36b2b 100644 --- a/docs/source/user_guide/feature_guide/graph_mode.md +++ b/docs/source/user_guide/feature_guide/graph_mode.md @@ -42,7 +42,7 @@ Online example: python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ --port 8000 \ - --model /models/Qwen3-8B-Instruct\ + --model /models/Qwen3-8B\ --gpu-memory-utilization 0.9 \ --trust-remote-code \ --max-model-len 32768 \ @@ -52,9 +52,17 @@ python -m vllm.entrypoints.openai.api_server \ --no-enable-chunked-prefill \ --distributed-executor-backend mp \ --served-model-name Qwen3-8B-Instruct \ - --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", - "vllm.unified_attention", "vllm.unified_attention_with_output", - "vllm.mamba_mixer2"]}' \ + --compilation-config '{"splitting_ops": ["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + 
"vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer"]}' \ + ``` diff --git a/docs/source/user_guide/support_matrix/supported_models.md b/docs/source/user_guide/support_matrix/supported_models.md index f3a6141..fb86800 100644 --- a/docs/source/user_guide/support_matrix/supported_models.md +++ b/docs/source/user_guide/support_matrix/supported_models.md @@ -4,30 +4,12 @@ | Model | Support | W8A8 | LoRA | Tensor Parallel | Expert Parallel | Data Parallel | Piecewise Kunlun Graph | | :------------ | :------------ | :--- | :--- | :-------------- | :-------------- | :------------ | :--------------------- | -| Qwen2 | ✅ | | ✅ | ✅ | | ✅ | ✅ | -| Qwen2.5 | ✅ | | ✅ | ✅ | | ✅ | ✅ | | Qwen3 | ✅ | | ✅ | ✅ | | ✅ | ✅ | | Qwen3-Moe | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Qwen3-Coder | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| QwQ-32B | ✅ | | | ✅ | | ✅ | ✅ | -| LLama2 | ✅ | | | ✅ | | ✅ | ✅ | -| LLama3 | ✅ | | | ✅ | | ✅ | ✅ | -| LLama3.1 | ✅ | | | ✅ | | ✅ | ✅ | -| GLM-4.5 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| GLM-4.5-Air | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Qwen3-next | 🔜Comming soon | | | | | | | -| gpt-oss | 🔜Comming soon | | | | | | | -| DeepSeek-V3 | 🔜Comming soon | | | | | | | -| DeepSeek-V3.2 | 🔜Comming soon | | | | | | | +| Qwen3-Next | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + ## Multimodal Language Models | Model | Support | W8A8 | LoRA | Tensor Parallel | Expert Parallel | Data Parallel | Piecewise Kunlun Graph | | :----------- | :------------ | :--- | :--- | :-------------- | :-------------- | :------------ | :--------------------- | -|Qianfan-VL | ✅ | | | ✅| |✅ |✅| -| Qwen2.5VL | ✅ | | | ✅ | | ✅ | ✅ | -| InternVL2.5 | ✅ | | | ✅ | | ✅ | ✅ | -| InternVL3 | ✅ | | | ✅ | | ✅ | ✅ | -| InternVL3.5 | ✅ | | | ✅ | | ✅ | ✅ | -| InternS1 | ✅ | | | ✅ | | ✅ | ✅ | -| Qwen2.5-Omni | 🔜Comming soon | | | | | | | -| Qwen3-VL | 🔜Comming soon | | | | | | | \ No newline at end of file +| Qwen3-VL | ✅ | | | ✅ | | ✅ | ✅ | diff --git a/pyproject.toml b/pyproject.toml index 790842d..c8fdc3f 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "vllm-kunlun" -version = "0.10.1.1" +version = "0.11.0" description = "vLLM Kunlun3 backend plugin" readme = "README.md" requires-python = ">=3.10" diff --git a/requirements.txt b/requirements.txt index 4c951bd..75f77e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,9 @@ +--index-url https://pip.baidu-int.com/simple/ +--trusted-host pip.baidu.com + setuptools==80.9.0 +opencv-python-headless==4.12.0.88 +llguidance==0.7.11 black==23.3.0 blake3==1.0.5 cachetools==6.1.0 @@ -24,11 +29,10 @@ pydantic==2.11.7 tokenizers>=0.21.2 uvloop==0.21.0 prometheus-fastapi-instrumentator==7.1.0 -transformers>=4.56.1 +transformers==4.57.0 +# 基础构建依赖 hatchling>=1.25 build>=1.0.3 pytest mock - - diff --git a/setup_env.sh b/setup_env.sh old mode 100644 new mode 100755 index 235b227..c282400 --- a/setup_env.sh +++ b/setup_env.sh @@ -1,11 +1,13 @@ unset XPU_DUMMY_EVENT export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export XPU_USE_MOE_SORTED_THRES=1 -export XFT_USE_FAST_SWIGLU=1 +export XFT_USE_FAST_SWIGLU=1 #使用快速swiglu实现 +export XPU_USE_FAST_SWIGLU=1 #使用moe算子中快速swiglu实现 export XMLIR_CUDNN_ENABLED=1 export XPU_USE_DEFAULT_CTX=1 -export XMLIR_FORCE_USE_XPU_GRAPH=1 -export XPU_USE_FAST_SWIGLU=1 +export XMLIR_FORCE_USE_XPU_GRAPH=1 # 优化图间sync +export XPU_USE_MOE_SORTED_THRES=128 # Moe sort threshold export VLLM_HOST_IP=$(hostname -i) -export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false -export FUSED_QK_ROPE_OP=0 \ No newline at end of file +export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false +VLLM_USE_V1=1 +##默认值为1,设置为0启用QWN3融合大算子 +USE_ORI_ROPE=1 \ No newline at end of file diff --git a/vllm_kunlun/__init__.py b/vllm_kunlun/__init__.py index a255d20..d0124d5 100644 --- a/vllm_kunlun/__init__.py +++ b/vllm_kunlun/__init__.py @@ -1,21 +1,3 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. 
-# Author: Xinyu Dong -# Email: dongxinyu03@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - """vllm kunlun init""" from .platforms import current_platform import sys @@ -25,26 +7,24 @@ import builtins import os import time import vllm.envs as envs - OLD_IMPORT_HOOK = builtins.__import__ - - def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0): try: start_time = time.time() - # Module mapping table + # 模块映射表 module_mappings = { "vllm.model_executor.layers.fused_moe.layer": "vllm_kunlun.ops.fused_moe.layer", "vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe": "vllm_kunlun.ops.quantization.compressed_tensors_moe", "vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper", + "vllm.v1.worker.gpu_model_runner": "vllm_kunlun.v1.worker.gpu_model_runner" } - # Keep the original imported modules + # 需要保持原始导入的模块 original_imports = [ "vllm.model_executor.layers.fused_moe.base", "vllm.model_executor.layers.fused_moe.config", - "vllm.model_executor.layers.fused_moe.layer", + "vllm.model_executor.layers.fused_moe.layer" ] if module_name in original_imports: @@ -55,7 +35,7 @@ def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0) globals=globals, locals=locals, fromlist=fromlist, - level=level, + level=level ) if module_name in module_mappings: @@ -68,15 +48,12 @@ def _custom_import(module_name, globals=None, locals=None, 
fromlist=(), level=0) return module relative_mappings = { - ( - "compressed_tensors_moe", - "compressed_tensors", - ): "vllm_kunlun.ops.quantization.compressed_tensors_moe", + ("compressed_tensors_moe", "compressed_tensors"): "vllm_kunlun.ops.quantization.compressed_tensors_moe", ("layer", "fused_moe"): "vllm_kunlun.ops.fused_moe.layer", } if level == 1: - parent = globals.get("__package__", "").split(".")[-1] if globals else "" + parent = globals.get('__package__', '').split('.')[-1] if globals else '' key = (module_name, parent) if key in relative_mappings: if module_name in sys.modules: @@ -91,15 +68,18 @@ def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0) pass return OLD_IMPORT_HOOK( - module_name, globals=globals, locals=locals, fromlist=fromlist, level=level + module_name, + globals=globals, + locals=locals, + fromlist=fromlist, + level=level ) - def import_hook(): """Apply import hook for VLLM Kunlun""" if not int(os.environ.get("DISABLE_KUNLUN_HOOK", "0")): builtins.__import__ = _custom_import - + try: modules_to_preload = [ "vllm_kunlun.ops.quantization.compressed_tensors_moe", @@ -112,31 +92,39 @@ def import_hook(): except Exception: pass - def register(): """Register the Kunlun platform""" from .utils import redirect_output - from .vllm_utils_wrapper import ( - direct_register_custom_op, - patch_annotations_for_schema, - ) - + from .vllm_utils_wrapper import direct_register_custom_op, patch_annotations_for_schema + patch_bitsandbytes_loader() import_hook() if envs.VLLM_USE_V1: - patch_V1blockTable() + # patch_V1blockTable() patch_V1top_p_K() - patch_V1penalties() + # TODO fixed fast top & k for vLLM 0.10.2, + pass else: patch_sampler() return "vllm_kunlun.platforms.kunlun.KunlunPlatform" - def register_model(): """Register models for training and inference""" from .models import register_model as _reg - _reg() +# [monkey patach sampler] +import sys +import sys, importlib, warnings + +def patch_bitsandbytes_loader(): + try: + 
# 载入你插件里自定义的 direct_register_custom_op 实现 + custom_utils = importlib.import_module("vllm_kunlun.models.model_loader.bitsandbytes_loader") + # 覆盖 vllm.utils + sys.modules["vllm.model_executor.model_loader.bitsandbytes_loader"] = custom_utils + print("[vllm_kunlun] bitsandbytes_loader patched ->", custom_utils.__file__) + except Exception as e: + warnings.warn(f"[vllm_kunlun] bitsandbytes_loader patch failed: {e!r}") def patch_sampler(): try: @@ -149,24 +137,12 @@ def patch_sampler(): def patch_V1top_p_K(): try: - custom_sampler = importlib.import_module( - "vllm_kunlun.v1.sample.ops.topk_topp_sampler" - ) + custom_sampler = importlib.import_module("vllm_kunlun.v1.sample.ops.topk_topp_sampler") sys.modules["vllm.v1.sample.ops.topk_topp_sampler"] = custom_sampler print("[vllm_kunlun] V1sampler top p & k patched ->", custom_sampler.__file__) except Exception as e: warnings.warn(f"[vllm_kunlun] V1 sampler top p & k patch failed: {e!r}") - -def patch_V1penalties(): - try: - custom_sampler = importlib.import_module("vllm_kunlun.v1.sample.ops.penalties") - sys.modules["vllm.v1.sample.ops.penalties"] = custom_sampler - print("[vllm_kunlun] V1sampler penalties patched ->", custom_sampler.__file__) - except Exception as e: - warnings.warn(f"[vllm_kunlun] V1 sampler penalties patch failed: {e!r}") - - def patch_V1blockTable(): try: custom_sampler = importlib.import_module("vllm_kunlun.v1.worker.block_table") @@ -175,6 +151,5 @@ def patch_V1blockTable(): except Exception as e: warnings.warn(f"[vllm_kunlun] V1 block table patch failed: {e!r}") - -# Automatically apply patches when modules are imported +# 在模块导入时自动应用补丁 import_hook() diff --git a/vllm_kunlun/compilation/wrapper.py b/vllm_kunlun/compilation/wrapper.py index 73f5d12..c03d8aa 100644 --- a/vllm_kunlun/compilation/wrapper.py +++ b/vllm_kunlun/compilation/wrapper.py @@ -1,20 +1,6 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. 
-# Author: Bao Qian -# Email: baoqian@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import os import sys from abc import abstractmethod @@ -46,7 +32,7 @@ class TorchCompileWrapperWithCustomDispatcher: def __init__(self, compiled_callable: Optional[Callable] = None, compilation_level: int = 0): - from vllm.config import get_current_vllm_config + from vllm.config import get_current_vllm_config, CUDAGraphMode vllm_config = get_current_vllm_config() self.vllm_config = vllm_config if compiled_callable is None: @@ -61,9 +47,13 @@ class TorchCompileWrapperWithCustomDispatcher: compiled_callable = torch.compile( self.forward, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + fullgraph=True, #envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend, options=options) + + # print(vllm_config.compilation_config) + # vllm_config.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + # vllm_config.compilation_config.cudagraph_capture_sizes = [32768] self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ @@ -126,7 +116,12 @@ class TorchCompileWrapperWithCustomDispatcher: decompiled_file) except Exception: pass - + # if self.vllm_config.compilation_config.use_cudagraph and \ + # "update" in new_code.co_names: + # import depyf + # src = 
depyf.decompile(new_code) + # msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa + # raise RuntimeError(msg) @contextmanager def dispatch_to_code(self, index: int): diff --git a/vllm_kunlun/distributed/kunlun_communicator.py b/vllm_kunlun/distributed/kunlun_communicator.py index aac3b8a..b3658ca 100644 --- a/vllm_kunlun/distributed/kunlun_communicator.py +++ b/vllm_kunlun/distributed/kunlun_communicator.py @@ -1,20 +1,3 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Author: Bao Qian, Dong Xinyu -# Email: baoqian@baidu.com, dongxinyu03@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
"""kunlun_communicator""" from contextlib import contextmanager from typing import Optional diff --git a/vllm_kunlun/lora/ops/kunlun_ops/__init__.py b/vllm_kunlun/lora/ops/kunlun_ops/__init__.py deleted file mode 100644 index ccf44c7..0000000 --- a/vllm_kunlun/lora/ops/kunlun_ops/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project""" - -from vllm_kunlun.lora.ops.kunlun_ops.lora_ops import (bgmv_expand,bgmv_expand_slice, bgmv_shrink, - sgmv_expand, sgmv_expand_slice, - sgmv_shrink) - - -__all__ = [ - "bgmv_expand", - "bgmv_expand_slice", - "bgmv_shrink", - "sgmv_expand", - "sgmv_expand_slice", - "sgmv_shrink" -] \ No newline at end of file diff --git a/vllm_kunlun/lora/ops/kunlun_ops/lora_ops.py b/vllm_kunlun/lora/ops/kunlun_ops/lora_ops.py deleted file mode 100644 index 7196eca..0000000 --- a/vllm_kunlun/lora/ops/kunlun_ops/lora_ops.py +++ /dev/null @@ -1,443 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# -# Author: Wang Hao -# Email: wanghao129@baidu.com -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""kunlun_ops for lora""" - -import torch -from torch._C import dtype - - -def sgmv_shrink( - inputs: torch.Tensor, - lora_a_weights: torch.Tensor, - output_tensor: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - expert_m: torch.Tensor, - b_seq_start_loc: torch.Tensor, - seq_len_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - batches: int, - max_seq_length: int, - token_nums: int, - scaling: float, -): - """ - sgmv_shrink - """ - - expert_num = 9 - device = inputs.device - - lora_ids = lora_indices_tensor.repeat_interleave(seq_len_tensor, dim=0).to( - device=device, dtype=torch.int32 - ) - - lora_ids.masked_fill_(lora_ids < 0, expert_num - 1).unsqueeze_(1) - - - - torch.ops._C.gen_block_statistic(lora_ids, block_statistic) - - - inputs_sorted = torch.zeros_like(inputs, dtype=inputs.dtype, device=device) - torch.ops._C.moe_pre_sorted( - inputs, - lora_ids, - block_statistic, - inputs_sorted, - moe_index, - expert_m, - sorted_tokens_num_lod - ) - - - output_tensor.unsqueeze_(1) - - torch.ops._C.moe_fc( - x=inputs_sorted, - weight=lora_a_weights, - sorted_tokens_num_lod=sorted_tokens_num_lod, - sorted_tokens_idx=moe_index, - moe_topk=1, - y=output_tensor, - act=None, - x_perchannel_max=None, - w_perchannel_max=None, - topk_ids=None, - topk_w=None, - bias=None, - tgemm_type=None, - tweight_type=None, - scale_n=0, - scale_k=0, - use_pack_int4=False - ) - - output_tensor.squeeze_(1).mul_(scaling) - - return output_tensor - - -def sgmv_expand(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - b_seq_start_loc: torch.Tensor, - seq_len_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - batches: int, - max_seq_length: int, - token_nums: int, - add_inputs: bool = False): - """ - sgmv_expand - """ - - - expert_num = 9 - device = inputs.device - - - 
lora_ids = lora_indices_tensor.repeat_interleave(seq_len_tensor, dim=0).to( - device=device, dtype=torch.int32 - ) - - lora_ids.masked_fill_(lora_ids < 0, expert_num - 1).unsqueeze_(1) - - out = torch.zeros((token_nums, 1, slice_size), dtype=inputs.dtype, device=device) - - - torch.ops._C.moe_fc( - x=inputs, - weight=lora_b_weights, - sorted_tokens_num_lod=sorted_tokens_num_lod, - sorted_tokens_idx=moe_index, - moe_topk=1, - y=out, - act=None, - x_perchannel_max=None, - w_perchannel_max=None, - topk_ids=None, - topk_w=None, - bias=None, - tgemm_type=None, - tweight_type=None, - scale_n=0, - scale_k=0, - use_pack_int4=False - ) - - output_post = out.squeeze(1) - torch.ops._C.moe_post( - output_post, - moe_index.unsqueeze(1), - normed_scale, - normed_scale, - output_post - ) - - - common_len = min(output_post.shape[1], output_tensor.shape[1]) - - limit = min(output_post.shape[0], output_tensor.shape[0]) - - - if add_inputs: - output_tensor[:limit, :common_len] += output_post[:limit, :common_len] - else: - output_tensor[:limit, :common_len] = output_post[:limit, :common_len] - - return output_tensor - - -def sgmv_expand_slice(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - normed_scale: torch.Tensor, - b_seq_start_loc: torch.Tensor, - seq_len_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - batches: int, - max_seq_length: int, - token_nums: int, - slice_offset: int, - slice_size: int, - add_inputs: bool = False): - - """ - sgmv_expand_slice - """ - - expert_num = 9 - device = inputs.device - - lora_ids = lora_indices_tensor.repeat_interleave(seq_len_tensor, dim=0).to( - device=device, dtype=torch.int32 - ) - - lora_ids.masked_fill_(lora_ids < 0, expert_num - 1).unsqueeze_(1) - - - out = torch.zeros((token_nums, 1, slice_size), dtype=inputs.dtype, device=device) - - - torch.ops._C.moe_fc( - x=inputs, - 
weight=lora_b_weights, - sorted_tokens_num_lod=sorted_tokens_num_lod, - sorted_tokens_idx=moe_index, - moe_topk=1, - y=out, - act=None, - x_perchannel_max=None, - w_perchannel_max=None, - topk_ids=None, - topk_w=None, - bias=None, - tgemm_type=None, - tweight_type=None, - scale_n=0, - scale_k=0, - use_pack_int4=False - ) - - output_post = out.squeeze(1) - torch.ops._C.moe_post( - output_post, - moe_index.unsqueeze(1), - normed_scale, - normed_scale, - output_post - ) - - - slice_end = slice_offset + slice_size - actual_slice_size = min(slice_size, output_tensor.shape[1] - slice_offset) - - limit = min(output_post.shape[0], output_tensor.shape[0]) - - - if add_inputs: - output_tensor[:limit, slice_offset:slice_end] += output_post[:limit, :actual_slice_size] - else: - output_tensor[:limit, slice_offset:slice_end] = output_post[:limit, :actual_slice_size] - - return output_tensor - - -def bgmv_shrink( - inputs: torch.Tensor, # [m, hidden_dim] - lora_a_weights: torch.Tensor, # [n, 1, r, hidden_dim] - output_tensor: torch.Tensor, # [m, r] - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - expert_m: torch.Tensor, - lora_indices_tensor: torch.Tensor, # [m] - scaling: float = 1.0 -) -> torch.Tensor: - """ - bgmv_shrink - """ - - expert_num = 9 - - lora_ids = lora_indices_tensor.to(dtype=torch.int32, device=inputs.device) - lora_ids.masked_fill_(lora_ids < 0, expert_num - 1) - - torch.ops._C.gen_block_statistic(lora_ids.unsqueeze(1), block_statistic) - - inputs_sorted = torch.empty_like(inputs, dtype=inputs.dtype, device=inputs.device) - - torch.ops._C.moe_pre_sorted( - inputs, - lora_ids.unsqueeze(1), - block_statistic, - inputs_sorted, - moe_index, - expert_m, - sorted_tokens_num_lod - ) - - output_tensor.unsqueeze_(1) # Change to [m, 1, r] - torch.ops._C.moe_fc( - x=inputs_sorted, - weight=lora_a_weights, - sorted_tokens_num_lod=sorted_tokens_num_lod, - sorted_tokens_idx=moe_index, - moe_topk=1, - y=output_tensor, - 
act=None, - x_perchannel_max=None, - w_perchannel_max=None, - topk_ids=None, - topk_w=None, - bias=None, - tgemm_type=None, - tweight_type=None, - scale_n=0, - scale_k=0, - use_pack_int4=False - ) - - output_tensor.squeeze_(1).mul_(scaling) - - return output_tensor - - -def bgmv_expand(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - lora_indices_tensor: torch.Tensor, - add_inputs: bool = True): - """" - bgmv_expand - """ - - - expert_num = 9 - device = inputs.device - - - - - lora_ids = lora_indices_tensor.to(dtype=torch.int32, device=inputs.device) - lora_ids.masked_fill_(lora_ids < 0, expert_num - 1) - - out = torch.zeros((inputs.shape[0], 1, slice_size), dtype=inputs.dtype, device=device) - - - torch.ops._C.moe_fc( - x=inputs, - weight=lora_b_weights, - sorted_tokens_num_lod=sorted_tokens_num_lod, - sorted_tokens_idx=moe_index, - moe_topk=1, - y=out, - act=None, - x_perchannel_max=None, - w_perchannel_max=None, - topk_ids=None, - topk_w=None, - bias=None, - tgemm_type=None, - tweight_type=None, - scale_n=0, - scale_k=0, - use_pack_int4=False - ) - - - - - - - output_post = out.squeeze(1) - torch.ops._C.moe_post(output_post, moe_index.unsqueeze(1), normed_scale, normed_scale, output_post) - - - limit = output_tensor.shape[0] - if output_post.shape[0] == 1 and output_tensor.shape[0] != 1: - limit = 1 - - # LoRA adapter and model may add different amounts of padding to output - common_len = min(output_post.shape[1], output_tensor.shape[1]) - - if add_inputs: - output_tensor[:, :common_len] += output_post[:limit, :common_len] - else: - output_tensor[:, :common_len] = output_post[:limit, :common_len] - - return output_tensor - - -def bgmv_expand_slice( - inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: 
torch.Tensor, - normed_scale: torch.Tensor, - lora_indices_tensor: torch.Tensor, - slice_offset: int, - slice_size: int, - add_inputs: bool = True -): - """ - bgmv_expand_slice - """ - - expert_num = 9 - device = inputs.device - - - - - lora_ids = lora_indices_tensor.to(dtype=torch.int32, device=inputs.device) - lora_ids.masked_fill_(lora_ids < 0, expert_num - 1) - - out = torch.zeros((inputs.shape[0], 1, slice_size), dtype=inputs.dtype, device=device) - - torch.ops._C.moe_fc( - x=inputs, - weight=lora_b_weights, - sorted_tokens_num_lod=sorted_tokens_num_lod, - sorted_tokens_idx=moe_index, - moe_topk=1, - y=out, - act=None, - x_perchannel_max=None, - w_perchannel_max=None, - topk_ids=None, - topk_w=None, - bias=None, - tgemm_type=None, - tweight_type=None, - scale_n=0, - scale_k=0, - use_pack_int4=False - ) - - - output_post = out.squeeze(1) - torch.ops._C.moe_post(output_post, moe_index.unsqueeze(1), normed_scale, normed_scale, output_post) - - - slice_end = slice_offset + slice_size - actual_slice_size = min(slice_size, output_tensor.shape[1] - slice_offset) - limit = min(output_post.shape[0], output_tensor.shape[0]) - - - if add_inputs: - output_tensor[:limit, slice_offset:slice_end] += output_post[:limit, :actual_slice_size] - else: - output_tensor[:limit, slice_offset:slice_end] = output_post[:limit, :actual_slice_size] - - return output_tensor diff --git a/vllm_kunlun/lora/punica_wrapper/punica_kunlun.py b/vllm_kunlun/lora/punica_wrapper/punica_kunlun.py deleted file mode 100644 index 0d85ede..0000000 --- a/vllm_kunlun/lora/punica_wrapper/punica_kunlun.py +++ /dev/null @@ -1,547 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Author: Wang Hao -# Email: wanghao129@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Based on: -Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). -Punica: Multi-Tenant LoRA Serving. -https://arxiv.org/abs/2310.18547 -""" - -from typing import TYPE_CHECKING, Optional, Union, final - -import torch - - -# SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional, Tuple, Union - - -from vllm_kunlun.lora.ops.kunlun_ops import ( - bgmv_expand, - bgmv_expand_slice, - bgmv_shrink, - sgmv_expand, - sgmv_expand_slice, - sgmv_shrink, -) - -from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase -import time - - -# The platforms that are compatible with the PyTorch-native implementation can -# inherit this class -class PunicaWrapperKunlun(PunicaWrapperBase): - """ - PunicaWrapperKunlun with moe_fc - """ - - def __init__( - self, - max_num_batched_tokens: int, - max_batches: int, - device: Union[torch.device, str], - **kwargs, - ): - PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) - - def _shrink_prefill( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - scale: float, - ): - - expert_m = torch.zeros(9, dtype=torch.int32, device=x.device) - - sgmv_shrink( - x, - w_t_all, - y, - block_statistic, - sorted_tokens_num_lod, - moe_index, - expert_m, - *self.prefill_metadata, - scale, - ) - - def _shrink_decode( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - 
moe_index: torch.Tensor, - scale: float, - ): - - expert_m = torch.zeros(9, dtype=torch.int32, device=x.device) - bgmv_shrink( - x, - w_t_all, - y, - block_statistic, - sorted_tokens_num_lod, - moe_index, - expert_m, - self.token_lora_indices, - scale, - ) - - def _expand_prefill( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - add_inputs: bool, - ): - - sgmv_expand( - x, - w_t_all, - y, - block_statistic, - sorted_tokens_num_lod, - moe_index, - *self.prefill_metadata, - add_inputs, - ) - - def _expand_decode( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - add_inputs: bool, - ): - bgmv_expand( - x, - w_t_all, - y, - block_statistic, - sorted_tokens_num_lod, - moe_index, - self.token_lora_indices, - add_inputs, - ) - - def _expand_slice_prefill( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - y_offset: int, - y_slice_size: int, - add_inputs: bool, - ): - - normed_scale = torch.ones([y.size(0), 1], dtype=torch.float32, device=x.device) - - sgmv_expand_slice( - x, - w_t_all, - y, - block_statistic, - sorted_tokens_num_lod, - moe_index, - normed_scale, - *self.prefill_metadata, - y_offset, - y_slice_size, - add_inputs, - ) - - def _expand_slice_decode( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - y_offset: int, - y_slice_size: int, - add_inputs: bool, - ): - - normed_scale = torch.ones([y.size(0), 1], dtype=torch.float32, device=x.device) - - bgmv_expand_slice( - x, - w_t_all, - y, - block_statistic, - sorted_tokens_num_lod, - moe_index, - normed_scale, - self.token_lora_indices, - y_offset, 
- y_slice_size, - add_inputs, - ) - - def _apply_expand( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - y_offset: int, - y_slice_size: int, - add_inputs: bool = True, - ): - """ - Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` - computation, which is suitable for the - GEMM of lora'b. - """ - - expand_slice_fun: Callable = ( - self._expand_slice_prefill if self.is_prefill else self._expand_slice_decode - ) - expand_slice_fun( - y, - x, - w_t_all, - block_statistic, - sorted_tokens_num_lod, - moe_index, - y_offset, - y_slice_size, - add_inputs, - ) - - def _apply_shrink( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - scale: float, - ): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `_shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the _shrink_decode function - should be called. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - - shrink_fun: Callable = ( - self._shrink_prefill if self.is_prefill else self._shrink_decode - ) - - shrink_fun( - y, x, w_t_all, block_statistic, sorted_tokens_num_lod, moe_index, scale - ) - - y = y.view_as(y_org) - - def add_shrink( - self, - y: Union[Tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - scale: float, - **kwargs, - ): - """ - Performs GEMM for multiple slices of lora_a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `_shrink_prefill` function should be called. 
- Otherwise, it is the decode stage, and the _shrink_decode function - should be called. - - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += (x @ lora_a_stacked[i]) * scale - - Args: - y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors - x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights - scale (float): Scaling factor for the operation - """ - - x = x.view(-1, x.shape[-1]) - - for slice_idx in range(len(lora_a_stacked)): # Each slice represents a layer - - self._apply_shrink( - y[slice_idx], - x, - lora_a_stacked[slice_idx], - block_statistic, - sorted_tokens_num_lod, - moe_index, - scale, - ) - - def add_expand( - self, - y: torch.Tensor, - x: Union[Tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, ...], - block_statistic: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - moe_index: torch.Tensor, - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - output_slices: Tuple[int, ...], - offset_start: int = 0, - add_inputs=True, - **kwargs, - ) -> None: - """ - Performs GEMM and bias addition for multiple slices of lora_b. - - Semantics: - for i in range(len(lora_b_stacked)): - slice = output_slices[i] - y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + - lora_bias_stacked[i] - offset += slice - - Args: - y (torch.Tensor): Output tensor. - x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): - bias's weight - output_slices (Tuple[int, ...]): Every slice's size - add_inputs (bool): Defaults to True. 
- """ - - y_org = y - y = y.view(-1, y.shape[-1]) - offset_left = offset_start - - if lora_bias_stacked is not None: - self._apply_bias( - self.token_lora_indices, y, output_slices, lora_bias_stacked - ) - - for slice_idx in range(len(lora_b_stacked)): - self._apply_expand( - y, - x[slice_idx], - lora_b_stacked[slice_idx], - block_statistic, - sorted_tokens_num_lod, - moe_index, - offset_left, - output_slices[slice_idx], - add_inputs=add_inputs, - ) - offset_left += output_slices[slice_idx] - - y = y.view_as(y_org) - - def add_lora_embedding( - self, - y: torch.Tensor, - x: torch.Tensor, - lora_b_stacked: torch.Tensor, - add_inputs: bool = True, - **kwargs, - ) -> None: - """ - Applies lora specifically for VocabParallelEmbeddingWithLoRA. - - Semantics: - y += x @ lora_b_stacked - - Args: - y (torch.Tensor): Output tensor. - x (torch.Tensor): Input tensor. - lora_b_stacked (torch.Tensor): lora_b's weights. - add_inputs (bool): Default to True. - """ - - expand_fun: Callable = ( - self._expand_prefill if self.is_prefill else self._expand_decode - ) - expand_fun(y, x, lora_b_stacked, add_inputs) - - def add_lora_linear( - self, - y: torch.Tensor, - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - scale: float, - output_slices: Tuple[int, ...], - *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, - **kwargs, - ) -> None: - """ - Applicable to linear-related lora. - - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += ( - x[i].unsqueeze(0) - @ lora_a_stacked[indices[i], layer_idx, :, :] - @ lora_b_stacked[indices[i], layer_idx, :, :] - * scale - ).squeeze(0)+lora_bias_stacked[i] - - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. 
- lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. - scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. - """ - - if self.no_lora: - return - - expert_num = 9 - block_statistic = torch.zeros( - [12, expert_num], dtype=torch.int32, device=x.device - ) - sorted_tokens_num_lod = torch.zeros( - expert_num + 1, dtype=torch.int32, device=x.device - ) - token_nums = x.size(0) - moe_index = torch.zeros(token_nums, dtype=torch.int32, device=x.device) - - assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) - if lora_bias_stacked is not None: - assert len(lora_bias_stacked) == len(output_slices) - y = self._apply_bias( - self.token_lora_indices, y, output_slices, lora_bias_stacked - ) - - if buffer is None: - r = lora_b_stacked[0].size(-1) - buffer = tuple( - torch.zeros((x.size(0), r), dtype=torch.float16, device=x.device) - for _ in range(len(output_slices)) - ) - # [tensor.squeeze_(1) for tensor in lora_a_stacked] - new_lora_a_stacked = tuple(lora_a.squeeze(1) for lora_a in lora_a_stacked) - self.add_shrink( - buffer, - x, - new_lora_a_stacked, - block_statistic, - sorted_tokens_num_lod, - moe_index, - scale, - **kwargs, - ) - # [tensor.unsqueeze_(1) for tensor in lora_a_stacked] - - # [tensor.squeeze_(1) for tensor in lora_b_stacked] - new_lora_b_stacked = tuple(lora_b.squeeze(1) for lora_b in lora_b_stacked) - self.add_expand( - y, - buffer, - new_lora_b_stacked, - block_statistic, - sorted_tokens_num_lod, - moe_index, - None, - output_slices, - add_inputs=True, - **kwargs, - ) - # [tensor.unsqueeze_(1) for tensor in lora_b_stacked] - - def add_lora_logits( - self, - y: torch.Tensor, - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - scale, - *, - buffer: Optional[torch.Tensor] = None, - **kwargs, - ) -> None: - """ - Applies lora specifically for LogitsProcessorWithLoRA. 
- - Semantics: - buffer = (x @ lora_a_stacked) * scale - y += buffer @ lora_b_stacked - - Args: - y (torch.Tensor): Output tensor. - x (torch.Tensor): Input tensor. - lora_a_stacked (torch.Tensor): lora_a's weights. - lora_b_stacked (torch.Tensor):lora_b's weights. - scale (float): Scaling factor. - buffer (Optional[torch.Tensor]):Default to None. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - - if lora_a_stacked.dim() == 2: - lora_a_stacked = lora_a_stacked.unsqueeze(0) - if lora_b_stacked.dim() == 2: - lora_b_stacked = lora_b_stacked.unsqueeze(0) - - r = lora_a_stacked.size(-1) - - if buffer is None: - buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - - indices = self.sampler_indices - if indices.max() >= lora_a_stacked.size(0): - indices = torch.clamp(indices, 0, lora_a_stacked.size(0) - 1) - - lora_a_reshaped = lora_a_stacked.transpose(1, 2) - lora_b_reshaped = lora_b_stacked.transpose(1, 2) - - bgmv_shrink(x, lora_a_reshaped, buffer, indices, scale) - bgmv_expand(buffer, lora_b_reshaped, y, indices, add_inputs=True) - - y = y.view_as(y_org) diff --git a/vllm_kunlun/models/__init__.py b/vllm_kunlun/models/__init__.py index 2d3b08f..4eb7de1 100644 --- a/vllm_kunlun/models/__init__.py +++ b/vllm_kunlun/models/__init__.py @@ -7,6 +7,12 @@ def register_model(): from .qwen2_5_vl import Qwen2_5_VLForConditionalGeneration #noqa: F401 from .qwen3 import Qwen3ForCausalLM #noqa: F401 from .qwen3_moe import Qwen3MoeForCausalLM #noqa: F401 + from .qwen3_vl import Qwen3VLForConditionalGeneration + from .qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + from .qwen3_omni_moe_thinker import Qwen3OmniMoeThinkerForConditionalGeneration + # from .llama4 import Llama4ForCausalLM #noqa: F401 + # from .mllama4 import Llama4ForConditionalGeneration #noqa: F401 + # from .deepseek_v2 import KunlunDeepseekV2MoE # ModelRegistry.register_model( # "DemoModel", @@ -27,6 +33,10 @@ def register_model(): 
ModelRegistry.register_model( "Qwen3MoeForCausalLM", "vllm_kunlun.models.qwen3_moe:Qwen3MoeForCausalLM") + + ModelRegistry.register_model( + "Qwen3NextForCausalLM", + "vllm_kunlun.models.qwen3_next:Qwen3NextForCausalLM") ModelRegistry.register_model( "GlmForCausalLM", @@ -34,7 +44,8 @@ def register_model(): ModelRegistry.register_model( "GptOssForCausalLM", - "vllm_kunlun.models.gpt_oss:GptOssForCausalLM") + "vllm_kunlun.models.gpt_oss:GptOssForCausalLM") + ModelRegistry.register_model( "InternLM2ForCausalLM", "vllm_kunlun.models.internlm2:InternLM2ForCausalLM") @@ -52,16 +63,20 @@ def register_model(): "vllm_kunlun.models.interns1:InternS1ForConditionalGeneration") ModelRegistry.register_model( - "Glm4MoeForCausalLM", - "vllm_kunlun.models.glm4_moe:Glm4MoeForCausalLM") + "Qwen3VLForConditionalGeneration", + "vllm_kunlun.models.qwen3_vl:Qwen3VLForConditionalGeneration") ModelRegistry.register_model( - "Glm4ForCausalLM", - "vllm_kunlun.models.glm4:Glm4ForCausalLM") + "Qwen3VLMoeForConditionalGeneration", + "vllm_kunlun.models.qwen3_vl_moe:Qwen3VLMoeForConditionalGeneration") ModelRegistry.register_model( - "Glm4vForConditionalGeneration", - "vllm_kunlun.models.glm4_1v:Glm4vForConditionalGeneration") + "Qwen3OmniMoeForConditionalGeneration", + "vllm_kunlun.models.qwen3_omni_moe_thinker:Qwen3OmniMoeThinkerForConditionalGeneration") + + ModelRegistry.register_model( + "SeedOssForCausalLM", + "vllm_kunlun.models.seed_oss:SeedOssForCausalLM") def register_quant_method(): diff --git a/vllm_kunlun/models/glm4.py b/vllm_kunlun/models/glm4.py deleted file mode 100644 index 816ab6e..0000000 --- a/vllm_kunlun/models/glm4.py +++ /dev/null @@ -1,301 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/glm4.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only GLM-4-0414 model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union - -import torch -from torch import nn -from transformers import Glm4Config - -from vllm.attention import AttentionType -from vllm_kunlun.ops.attention.layer import Attention -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP -from vllm_kunlun.models.llama import LlamaMLP as Glm4MLP -from vllm_kunlun.models.llama import LlamaModel -from vllm.model_executor.models.utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix - - -class Glm4Attention(nn.Module): - - def __init__(self, - config: Glm4Config, - hidden_size: 
int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - head_dim: Optional[int] = None, - qkv_bias: bool = False, - rope_theta: float = 10000, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[tuple] = None, - prefix: str = "", - attn_type: str = AttentionType.DECODER) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or hidden_size // self.total_num_heads - self.rotary_dim = self.head_dim - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.rotary_dim, - max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, - 
partial_rotary_factor=partial_rotary_factor, - is_neox_style=False, - ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=attn_type) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class Glm4DecoderLayer(nn.Module): - - def __init__( - self, - config: Glm4Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - - self.self_attn = Glm4Attention( - config=config, - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - qkv_bias=getattr(config, 'attention_bias', False), - head_dim=getattr(config, 'head_dim', None), - cache_config=cache_config, - quant_config=quant_config, - rope_scaling=rope_scaling, - prefix=f"{prefix}.self_attn", - attn_type=AttentionType.DECODER, - ) - self.mlp = Glm4MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_self_attn_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_mlp_layernorm = 
RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - hidden_states = self.post_self_attn_layernorm(hidden_states) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - hidden_states = self.post_mlp_layernorm(hidden_states) - - return hidden_states, residual - - -ALL_DECODER_LAYER_TYPES = { - "attention": Glm4DecoderLayer, -} - - -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - "positions": -1, - "intermediate_tensors": 0, - "inputs_embeds": 0, - }) -class Glm4Model(LlamaModel): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, - prefix=prefix, - layer_type=Glm4DecoderLayer) - - -class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = Glm4Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - if get_pp_group().is_last_rank: - if config.tie_word_embeddings: - self.lm_head = self.model.embed_tokens - else: - self.lm_head = 
ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix( - prefix, "lm_head")) - else: - self.lm_head = PPMissingLayer() - - self.logits_processor = LogitsProcessor(config.vocab_size) - - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["lm_head."] - if self.config.tie_word_embeddings else None), - ) - return loader.load_weights(weights) diff --git a/vllm_kunlun/models/glm4_1v.py b/vllm_kunlun/models/glm4_1v.py deleted file mode 100644 index dfd5ecf..0000000 --- a/vllm_kunlun/models/glm4_1v.py +++ /dev/null @@ -1,1597 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/glm4_1v.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only GLM-4V model compatible with HuggingFace weights.""" - -import math -from collections.abc import Iterable, Mapping, Sequence -from functools import partial -from typing import Annotated, Any, Callable, Literal, Optional, Union - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from transformers import BatchFeature -from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig -from transformers.models.glm4v.image_processing_glm4v import ( - Glm4vImageProcessor, smart_resize) -from transformers.models.glm4v.video_processing_glm4v import ( - Glm4vVideoProcessor) -from transformers.video_utils import VideoMetadata - -from vllm.config import VllmConfig -from vllm.distributed import parallel_state -from vllm.distributed import utils as dist_utils -from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, VideoItem) -from vllm.multimodal.parse import (ImageSize, 
MultiModalDataItems, - MultiModalDataParser) -from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.platforms import _Backend -from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope -from vllm.utils.tensor_schema import TensorSchema, TensorShape - -from vllm_kunlun.ops.activation import SiluAndMul -from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP) -from vllm_kunlun.models.qwen2_vl import _qwen2vl_field_config, apply_rotary_pos_emb_vision -from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) -from vllm.model_executor.models.vision import get_vit_attn_backend - -logger = init_logger(__name__) - -# For profile run -_MAX_FRAMES_PER_VIDEO = 600 - -# === Vision Inputs === # - -import torch -import torch.nn.functional as F - -def grid_sample(input, grid, **kwargs): - try: - return F.grid_sample(input, grid, **kwargs) - except RuntimeError: - # if grid_sample is not implemented on XPU, falling back to CPU. 
- result = F.grid_sample(input.cpu(), grid.cpu(), **kwargs).to(device=input.device, dtype=input.dtype).contiguous() - return result - -class Glm4vImagePixelInputs(TensorSchema): - """ - Dimensions: - - np: Number of patches - - cpp: Number of channels * patch_size * patch_size - - ni: Number of images - - g: Grid dimensions (3 for grid_t, grid_h, grid_w) - """ - type: Literal["pixel_values"] = "pixel_values" - - pixel_values: Annotated[torch.Tensor, TensorShape("np", "cpp")] - image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] - - -class Glm4vImageEmbeddingInputs(TensorSchema): - """ - Dimensions: - - f: Number of image features (varies based on image resolution) - - h: Hidden size (must match language model backbone) - - n: Number of images - - g: Grid dimensions (3 for grid_t, grid_h, grid_w) - """ - type: Literal["image_embeds"] = "image_embeds" - - image_embeds: Annotated[torch.Tensor, TensorShape("f", "h")] - image_grid_thw: Annotated[torch.Tensor, TensorShape("n", 3)] - - -Glm4vImageInputs = Union[Glm4vImagePixelInputs, Glm4vImageEmbeddingInputs] - - -class Glm4vVideoPixelInputs(TensorSchema): - """ - Dimensions: - - np: Number of patches - - ctpp: Number of channels * temporal_patch_size * - patch_size * patch_size - - f: Number of frames - - g: Grid dimensions (3 for grid_t which is usually 1 for processed - video, grid_h, grid_w) - """ - type: Literal["pixel_values_videos"] = "pixel_values_videos" - - pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctpp")] - video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)] - - -class Glm4vVideoEmbeddingInputs(TensorSchema): - """ - Dimensions: - - p: Number of video patches across all frames - - h: Hidden size (must match language model backbone) - - f: Number of frames - - g: Grid dimensions (3 for grid_t which is usually 1 for processed - video, grid_h, grid_w) - """ - type: Literal["video_embeds"] = "video_embeds" - - video_embeds: Annotated[torch.Tensor, TensorShape("p", "h")] - 
video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)] - - -Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs] - -# === Vision Encoder === # - - -class Glm4vVisionMLP(nn.Module): - - def __init__( - self, - in_features: int, - hidden_features: int, - bias: bool = False, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - input_size=in_features, - output_sizes=[hidden_features] * 2, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.down_proj") - self.act_fn = SiluAndMul() - - def forward(self, x: torch.Tensor): - x, _ = self.gate_up_proj(x) - x = self.act_fn(x) - x, _ = self.down_proj(x) - return x - - -def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int): - """All-gather the input tensor interleavely across model parallel group.""" - import torch.distributed as dist - - gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] - dist.all_gather( - gathered_tensors, - local_tensor, - group=parallel_state.get_tp_group().device_group, - ) - - gathered_tensors_split = [ - torch.split(tensor, hidden_size // tp_size, -1) - for tensor in gathered_tensors - ] - ordered_tensors = [ - tensor for pair in zip(*gathered_tensors_split) for tensor in pair - ] - result_tensor = torch.cat(ordered_tensors, dim=-1) - return result_tensor - - -class Glm4vVisionAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - projection_size: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - # Per attention head and per partition values. 
- self.tp_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_rank = parallel_state.get_tensor_model_parallel_rank() - self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads) - self.num_attention_heads_per_partition = dist_utils.divide( - num_heads, self.tp_size) - - self.qkv = QKVParallelLinear( - hidden_size=embed_dim, - head_size=self.hidden_size_per_attention_head, - total_num_heads=num_heads, - total_num_kv_heads=num_heads, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) - self.proj = RowParallelLinear( - input_size=projection_size, - output_size=embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - bias=False, - ) - - # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) - if self.attn_backend not in { - _Backend.FLASH_ATTN, - _Backend.TORCH_SDPA, - _Backend.XFORMERS, - }: - raise RuntimeError( - f"GLM-4V does not support {self.attn_backend} backend now.") - - def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: - # [s, b, 3 * head * head_dim] - seq_len, bs, _ = qkv.shape - if self.tp_size > 1: - qkv = all_gather_interleave(qkv, self.qkv.hidden_size, - self.tp_size) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] - q, k, v = qkv.chunk(3, dim=2) - - # 3 * [s, b, head * head_dim] - if self.tp_size > 1: - splitter = partial( - dist_utils.split_tensor_along_last_dim, - num_partitions=self.tp_size, - ) - q = splitter(q)[self.tp_rank] - k = splitter(k)[self.tp_rank] - v = splitter(v)[self.tp_rank] - - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = ( - seq_len, - bs, - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ) - q, k, v = (x.view(*new_shape) for x in (q, k, v)) - return q, k, v - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, - max_seqlen: Optional[int] = None, # Only used 
for Flash Attention - seqlens: Optional[list[int]] = None, # Only used for xFormers - ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] - - q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() - for x in (q, k, v)) - if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - - if self.attn_backend == _Backend.FLASH_ATTN: - # from vllm_flash_attn.flash_attn_interface import ( - # flash_attn_varlen_func) - from flash_attn import flash_attn_varlen_func - - q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) - - output = flash_attn_varlen_func( - q, - k, - v, - cu_seqlens_q=cu_seqlens, - cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen, - max_seqlen_k=max_seqlen, - dropout_p=0, - causal=False, - ) - - context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) - elif self.attn_backend == _Backend.TORCH_SDPA: - # Execute attention entry by entry for speed & less VRAM. 
- outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] - q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") - for x in [q_i, k_i, v_i]) - output_i = F.scaled_dot_product_attention(q_i, - k_i, - v_i, - dropout_p=0.0) - output_i = rearrange(output_i, "b h s d -> b s h d ") - outputs.append(output_i) - context_layer = torch.cat(outputs, dim=1) - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, - kv_seqlen=None, - device=q.device) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None) - - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() - - output, _ = self.proj(context_layer) - return output - - -class Glm4vVisionBlock(nn.Module): - - def __init__( - self, - dim: int, - num_heads: int, - mlp_hidden_dim: int, - norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - if norm_layer is None: - norm_layer = partial(nn.LayerNorm, eps=1e-6) - self.norm1 = norm_layer(dim) - self.norm2 = norm_layer(dim) - self.attn = Glm4vVisionAttention( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - self.mlp = Glm4vVisionMLP( - dim, - mlp_hidden_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, - max_seqlen: Optional[int] = None, # Only used for Flash Attention - seqlens: Optional[list[int]] = None, # Only used for xFormers - ) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), - 
cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, - max_seqlen=max_seqlen, - seqlens=seqlens, - ) - - x = x + self.mlp(self.norm2(x)) - return x - - -class Glm4vVisionPatchEmbed(nn.Module): - - def __init__( - self, - patch_size: int = 14, - temporal_patch_size: int = 1, - in_channels: int = 3, - hidden_size: int = 1536, - ) -> None: - super().__init__() - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.hidden_size = hidden_size - - kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = nn.Conv3d( - in_channels, - hidden_size, - kernel_size=kernel_size, - stride=kernel_size, - bias=True, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - L, C = x.shape - x = x.view(L, -1, self.temporal_patch_size, self.patch_size, - self.patch_size) - x = self.proj(x).view(L, self.hidden_size) - return x - - -class Glm4vPatchMerger(nn.Module): - - def __init__( - self, - d_model: int, - context_dim: int, - quant_config: Optional[QuantizationConfig] = None, - bias: bool = False, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = d_model - self.proj = ColumnParallelLinear(self.hidden_size, - self.hidden_size, - bias=bias, - gather_output=True, - quant_config=quant_config, - prefix=f"{prefix}.proj") - self.post_projection_norm = nn.LayerNorm(self.hidden_size) - self.gate_up_proj = MergedColumnParallelLinear( - input_size=self.hidden_size, - output_sizes=[context_dim] * 2, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj", - ) - self.down_proj = RowParallelLinear( - context_dim, - self.hidden_size, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.down_proj", - ) - self.act_fn = SiluAndMul() - self.extra_activation_func = nn.GELU() - - def forward(self, x: torch.Tensor): - x, _ = self.proj(x) - x = self.extra_activation_func(self.post_projection_norm(x)) - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x 
- - -class Glm4vVisionEmbeddings(nn.Module): - - def __init__(self, config: Glm4vVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.num_patches = (self.image_size // self.patch_size)**2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, - self.embed_dim) - self.register_buffer( - "position_ids", - torch.arange(self.num_positions).expand((1, -1)), - persistent=False, - ) - - def forward(self, embeddings, lengths, image_shapes, h_coords, - w_coords) -> torch.Tensor: - pos_embed_weight = self.position_embedding.weight - hidden_size = pos_embed_weight.shape[1] - total_seq = h_coords.shape[0] - device = pos_embed_weight.device - - # Move coordinates to correct device - h_coords, w_coords = h_coords.to(device), w_coords.to(device) - - # Handle empty sequence case - if total_seq == 0: - adapted_pos_embed = torch.empty(0, - hidden_size, - device=device, - dtype=pos_embed_weight.dtype) - else: - # Convert inputs to tensors if needed - if isinstance(lengths, list): - lengths = torch.tensor(lengths, - device=device, - dtype=torch.long) - if not isinstance(image_shapes, torch.Tensor): - image_shapes = torch.tensor(image_shapes, - device=device, - dtype=torch.long) - - # Prepare 2D position embedding - orig_size_sq = pos_embed_weight.shape[0] - orig_size = int(orig_size_sq**0.5) - pos_embed_2d = (pos_embed_weight.view( - orig_size, orig_size, - hidden_size).permute(2, 0, - 1).unsqueeze(0).to(device=device, - dtype=torch.float32)) - - # Calculate target dimensions for each patch - target_h = torch.cat([ - image_shapes[i, 1].repeat(lengths[i]) - for i in range(len(lengths)) - ]).to(device=device, dtype=torch.float32) - target_w = torch.cat([ - image_shapes[i, 2].repeat(lengths[i]) - for i in range(len(lengths)) - ]).to(device=device, dtype=torch.float32) - - # Normalize coordinates to [-1, 1] range for 
grid_sample - h_coords = h_coords.to(device=device, dtype=torch.float32) - w_coords = w_coords.to(device=device, dtype=torch.float32) - norm_w = ((w_coords + 0.5) / target_w) * 2 - 1 - norm_h = ((h_coords + 0.5) / target_h) * 2 - 1 - - # Create sampling grid - grid = (torch.stack((norm_w, norm_h), - dim=-1).unsqueeze(0).unsqueeze(2)) - - # Perform bicubic interpolation - interpolated_embed_fp32 = grid_sample( - pos_embed_2d, - grid, - mode="bicubic", - align_corners=False, - padding_mode="border", - ) - - # Reshape and convert back to original dtype - adapted_pos_embed_fp32 = ( - interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)) - adapted_pos_embed = adapted_pos_embed_fp32.to( - pos_embed_weight.dtype).to(embeddings.device) - - # Add adapted position encoding to embeddings - embeddings = embeddings + adapted_pos_embed - return embeddings - - -class Glm4vVisionRotaryEmbedding(nn.Module): - - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / (theta - **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - self.inv_freq = 1.0 / (self.theta**(torch.arange( - 0, - self.dim, - 2, - dtype=torch.float, - device=self.inv_freq.device, - ) / self.dim)) - seq = torch.arange(seqlen, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - -class Glm4vVisionTransformer(nn.Module): - - def __init__( - self, - vision_config: Glm4vVisionConfig, - norm_eps: float = 1e-6, - quant_config: Optional[QuantizationConfig] = None, - prefix: 
str = "", - ) -> None: - super().__init__() - - patch_size = vision_config.patch_size - temporal_patch_size = vision_config.temporal_patch_size - in_channels = vision_config.in_channels - depth = vision_config.depth - self.hidden_size = vision_config.hidden_size - self.num_heads = vision_config.num_heads - - self.patch_size = vision_config.patch_size - self.spatial_merge_size = vision_config.spatial_merge_size - self.out_hidden_size = vision_config.out_hidden_size - - self.patch_embed = Glm4vVisionPatchEmbed( - patch_size=patch_size, - temporal_patch_size=temporal_patch_size, - in_channels=in_channels, - hidden_size=self.hidden_size, - ) - - norm_layer = partial(RMSNorm, eps=norm_eps) - head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2) - self.blocks = nn.ModuleList([ - Glm4vVisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.out_hidden_size, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}", - ) for layer_idx in range(depth) - ]) - self.merger = Glm4vPatchMerger( - d_model=vision_config.out_hidden_size, - context_dim=vision_config.intermediate_size, - quant_config=quant_config, - bias=False, - prefix=f"{prefix}.merger", - ) - self.embeddings = Glm4vVisionEmbeddings(vision_config) - - self.post_conv_layernorm = RMSNorm(vision_config.hidden_size, - eps=vision_config.rms_norm_eps) - self.downsample = nn.Conv2d( - in_channels=vision_config.hidden_size, - out_channels=vision_config.out_hidden_size, - kernel_size=vision_config.spatial_merge_size, - stride=vision_config.spatial_merge_size, - ) - self.post_layernorm = RMSNorm(vision_config.hidden_size, - eps=vision_config.rms_norm_eps) - - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) - - @property - def dtype(self) -> torch.dtype: - return self.patch_embed.proj.weight.dtype - - @property - def device(self) -> torch.device: - return 
self.patch_embed.proj.weight.device - - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = (hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten()) - wpos_ids = (wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten()) - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb, pos_ids - - def compute_attn_mask_seqlen( - self, - cu_seqlens: torch.Tensor, - ) -> tuple[Optional[int], Optional[list[int]]]: - max_seqlen, seqlens = None, None - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - if self.attn_backend == _Backend.FLASH_ATTN: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - return max_seqlen, seqlens - - def forward( - self, - x: torch.Tensor, - grid_thw: torch.Tensor, - ) -> torch.Tensor: - # patchify - x = x.to(device=self.device, dtype=self.dtype) - x = self.patch_embed(x) - x = self.post_conv_layernorm(x) - - # compute position embedding - rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw) - # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, 0]).cumsum( - dim=0, dtype=torch.int32) - cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) - - # pre-compute seqlens for attn mask to reduce cuMemcpy operations - max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) - x = self.embeddings(x, seqlens, grid_thw, image_type_ids[:, 0], - 
image_type_ids[:, 1]) - - # transformers - x = x.unsqueeze(1) - for blk in self.blocks: - x = blk( - x, - cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, - max_seqlen=max_seqlen, - seqlens=seqlens, - ) - - # adapter - x = self.post_layernorm(x) - - x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size, - x.shape[-1]) - x = x.permute(0, 3, 1, 2) - x = self.downsample(x).view(-1, self.out_hidden_size) - x = self.merger(x) - - return x - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("attn.qkv.", "attn.q.", "q"), - ("attn.qkv.", "attn.k.", "k"), - ("attn.qkv.", "attn.v.", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class Glm4vProcessingInfo(BaseProcessingInfo): - - def get_hf_config(self): - return self.ctx.get_hf_config() - - def get_tokenizer(self): - return self.ctx.tokenizer - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": 1} - - def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor: - return self.get_hf_processor(**kwargs).image_processor - - def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor: - return self.get_hf_processor(**kwargs).video_processor - - def _get_vision_info( - self, - *, - 
image_width: int, - image_height: int, - num_frames: int = 16, - do_resize: bool = True, - max_image_pixels: int = 28 * 28 * 2 * 30000, - ) -> tuple[ImageSize, int]: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - patch_size = vision_config.patch_size - merge_size = vision_config.spatial_merge_size - temporal_patch_size = vision_config.temporal_patch_size - if do_resize: - resized_height, resized_width = smart_resize( - num_frames=num_frames - if num_frames > temporal_patch_size else temporal_patch_size, - height=image_height, - width=image_width, - factor=patch_size * merge_size, - max_pixels=max_image_pixels, - ) - preprocessed_size = ImageSize(width=resized_width, - height=resized_height) - else: - preprocessed_size = ImageSize(width=image_width, - height=image_height) - - # NOTE: Frames are padded to be divisible by `temporal_patch_size` - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294 - padded_num_frames = num_frames + num_frames % temporal_patch_size - - grid_t = max(padded_num_frames // temporal_patch_size, 1) - grid_h = preprocessed_size.height // patch_size - grid_w = preprocessed_size.width // patch_size - - num_patches = grid_t * grid_h * grid_w - num_vision_tokens = num_patches // (merge_size**2) - - return preprocessed_size, num_vision_tokens - - def get_image_size_with_most_features(self) -> ImageSize: - max_image_size, _ = self._get_vision_info(image_width=9999999, - image_height=9999999) - return max_image_size - - def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - _, num_image_tokens = self._get_vision_info( - image_width=image_width, - image_height=image_height, - max_image_pixels=28 * 28 * 2 * 6144, - ) - return num_image_tokens - - def get_max_image_tokens(self) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - return self.get_num_image_tokens( - 
image_width=target_width, - image_height=target_height, - ) - - def get_num_video_tokens( - self, - *, - image_width: int, - image_height: int, - num_frames: int, - ) -> int: - _, num_video_tokens = self._get_vision_info( - image_width=image_width, - image_height=image_height, - num_frames=num_frames, - max_image_pixels=28 * 28 * 2 * 30000, - ) - return num_video_tokens - - def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - num_frames = 0 - - while True: - next_num_frames = num_frames + 1 - next_max_tokens = self.get_num_video_tokens( - image_width=target_width, - image_height=target_height, - num_frames=next_num_frames, - ) - if next_max_tokens > max_tokens or next_max_tokens == 0: - break - - num_frames = next_num_frames - - return num_frames - - def get_num_frames_with_most_features( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> int: - max_images = mm_counts.get("image", 0) - max_videos = mm_counts.get("video", 0) - - max_image_tokens = self.get_max_image_tokens() * max_images - max_total_frames = self._get_max_video_frames(seq_len - - max_image_tokens) - max_frames_per_video = min(max_total_frames // max(max_videos, 1), - _MAX_FRAMES_PER_VIDEO) - - return max(max_frames_per_video, 1) - - def _get_video_second_idx(self, metadata: dict[str, Any], - total_frames: int) -> list[int]: - video_processor = self.get_video_processor() - - video_fps = metadata.get("fps", video_processor.fps) - meta_frames = metadata.get("total_num_frames", total_frames) - max_frame_idx = meta_frames - 1 - duration = metadata.get("duration", - round(max_frame_idx / video_fps) + 1) - if duration <= video_processor.max_duration: - n = int(math.floor(duration * video_processor.fps)) - frame_indices = [ - min( - max_frame_idx, - int(math.ceil(i * video_fps / video_processor.fps)), - ) for i in range(n) - ] - else: - num_samples = int(video_processor.max_duration * - video_processor.fps) - if 
num_samples >= meta_frames: - frame_indices = list(range(meta_frames)) - else: - target_seconds = np.linspace(0, - duration, - num_samples, - endpoint=True) - frame_indices = [ - min(max_frame_idx, int(math.ceil(t * video_fps))) - for t in target_seconds - ] - - seen, uniq = set(), [] - for idx in frame_indices: - if idx not in seen: - seen.add(idx) - uniq.append(idx) - if len(uniq) & 1: - uniq.append(uniq[-1]) - frame_indices = uniq - - full_second_idxs = [int(idx / video_fps) for idx in frame_indices] - timestamps_list = full_second_idxs[::2] - selected_timestamps = [] - for idx in range(0, len(timestamps_list)): - selected_timestamps.append(timestamps_list[idx]) - return selected_timestamps - - -class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - - hf_config = self.info.get_hf_config() - hf_processor = self.info.get_hf_processor() - tokenizer = self.info.get_tokenizer() - - image_token: str = hf_processor.image_token - video_token_ids = [ - hf_config.video_start_token_id, - hf_processor.video_token_id, - hf_config.video_end_token_id, - ] - video_token = tokenizer.decode(video_token_ids) - - return image_token * num_images + video_token * num_videos - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - - target_width, target_height = ( - self.info.get_image_size_with_most_features()) - target_num_frames = self.info.get_num_frames_with_most_features( - seq_len, mm_counts) - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images), - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=target_num_frames, - num_videos=num_videos, - ), - } - - def 
_get_dummy_videos( - self, - *, - width: int, - height: int, - num_frames: int, - num_videos: int, - ) -> list[VideoItem]: - video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) - video_items = [] - for i in range(num_videos): - video_metadata = { - "fps": 2.0, - "duration": num_frames / 2.0, - "total_num_frames": num_frames, - "video_backend": "opencv", - } - video_item = (video.copy(), video_metadata) - video_items.append(video_item) - - return video_items - - -class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): - - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(video_needs_metadata=True) - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_data = dict(mm_data) - processor = self.info.get_hf_processor(**mm_kwargs) - - # GLM-4.1V use `image_token_id` as video placeholder, we need to - # replace it with `video_token_id` for video processing. So we - # separate video processing from image processing. - if ("videos" in mm_data and isinstance(mm_data["videos"], list) - and len(mm_data["videos"]) > 0): - video_grid_thw_lst = [] - pixel_values_videos_lst = [] - for item in mm_data.pop("videos", []): - video_array, metadata = item - - # FIXME(Isotr0py): Activate the below logic after we can disable - # resampling from video loader backend. - # assert metadata["total_num_frames"] == len(video_array), ( - # f"Total frames {metadata['total_num_frames']} does not " - # f"match the length of video array {len(video_array)}.") - - # NOTE: Temporary workaround for resampled videos. - # this can cause a divergence with HF implementation if - # the input video is resampled in advance. - - if metadata["total_num_frames"] != len(video_array): - logger.warning( - "Total frames in metadata " - "(%s) does not match the length of " - "video array %s. 
This can " - "be because the video is resampled " - "in advance. This may cause " - "a divergence with HF implementation.", - metadata["total_num_frames"], - len(video_array), - ) - metadata["total_num_frames"] = len(video_array) - metadata = VideoMetadata(**metadata) - - video_mm_data = dict() - video_mm_data["videos"] = [[video_array]] - video_mm_data["video_metadata"] = [[metadata]] - - video_outputs = super()._call_hf_processor( - prompt="<|begin_of_video|><|video|><|end_of_video|>", - mm_data=video_mm_data, - mm_kwargs=mm_kwargs, - tok_kwargs=tok_kwargs, - ) - input_ids = video_outputs.pop("input_ids") - input_ids[input_ids == processor.image_token_id] = ( - processor.video_token_id) - video_placeholder = processor.tokenizer.batch_decode( - input_ids)[0] - prompt = prompt.replace( - "<|begin_of_video|><|video|><|end_of_video|>", - video_placeholder, - ) - - video_grid_thw_lst.append(video_outputs["video_grid_thw"]) - pixel_values_videos_lst.append( - video_outputs["pixel_values_videos"]) - video_outputs = dict( - pixel_values_videos=torch.cat(pixel_values_videos_lst), - video_grid_thw=torch.cat(video_grid_thw_lst), - ) - else: - video_outputs = dict() - - processed_outputs = super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - tok_kwargs=tok_kwargs, - ) - combined_outputs = dict( - processed_outputs, - **video_outputs, - ) - return BatchFeature(combined_outputs) - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return _qwen2vl_field_config(hf_inputs) - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, - ) -> Sequence[PromptUpdate]: - hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - image_processor = self.info.get_image_processor( - **hf_processor_mm_kwargs) - tokenizer = self.info.get_tokenizer() - 
hf_config = self.info.get_hf_config() - - boi_token_id = hf_config.image_start_token_id - eoi_token_id = hf_config.image_end_token_id - - bov_token_id = hf_config.video_start_token_id - eov_token_id = hf_config.video_end_token_id - - merge_length = image_processor.merge_size**2 - - def get_image_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["image_grid_thw"][item_idx] - assert isinstance(grid_thw, torch.Tensor) - - num_tokens = int(grid_thw.prod()) // merge_length - return [hf_processor.image_token_id] * num_tokens - - def get_video_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] - assert isinstance(grid_thw, torch.Tensor) - - video, metadata = mm_items["video"][item_idx] - timestamps = self.info._get_video_second_idx(metadata, len(video)) - frames_idx_token = [ - tokenizer.encode(str(i), add_special_tokens=False) - for i in timestamps - ] - num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length - placeholder = [] - placeholder.append(bov_token_id) - for frame_idx in frames_idx_token: - placeholder.append(boi_token_id) - placeholder.extend([hf_processor.video_token_id] * - num_tokens_per_frame) - placeholder.append(eoi_token_id) - placeholder.extend(frame_idx) - placeholder.append(eov_token_id) - return PromptUpdateDetails.select_token_id( - placeholder, - embed_token_id=hf_processor.video_token_id, - ) - - return [ - PromptReplacement( - modality="image", - target=hf_processor.image_token, - replacement=get_image_replacement_glm4v, - ), - PromptReplacement( - modality="video", - target="<|begin_of_video|><|video|><|end_of_video|>", - replacement=get_video_replacement_glm4v, - ), - ] - - -@MULTIMODAL_REGISTRY.register_processor( - Glm4vMultiModalProcessor, - info=Glm4vProcessingInfo, - dummy_inputs=Glm4vDummyInputsBuilder, -) -class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", 
- ], - "gate_up_proj": ["gate_up_proj"] - } - - # To ensure correct weight loading and mapping. - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", - "model.language_model.": "language_model.model.", - "model.visual.": "visual.", - }) - - @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: - if modality.startswith("image"): - return "<|begin_of_image|><|image|><|end_of_image|>" - if modality.startswith("video"): - return "<|begin_of_video|><|video|><|end_of_video|>" - - raise ValueError("Only image or video modality is supported") - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config - - self.config = config - self.multimodal_config = multimodal_config - - self.visual = Glm4vVisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-5), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - if config.model_type == "glm4v": - architectures = ["Glm4ForCausalLM"] - elif config.model_type == "glm4v_moe": - architectures = ["Glm4MoeForCausalLM"] - else: - architectures = None - - self.language_model = init_vllm_registered_model( - vllm_config=vllm_config, - hf_config=config.text_config, - prefix=maybe_prefix(prefix, "language_model"), - architectures=architectures) - - self.make_empty_intermediate_tensors = ( - self.language_model.make_empty_intermediate_tensors) - - def _validate_and_reshape_mm_tensor(self, mm_input: object, - name: str) -> torch.Tensor: - if not isinstance(mm_input, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of {name}. 
Got type: {type(mm_input)}") - if isinstance(mm_input, torch.Tensor): - if mm_input.ndim == 2: - return mm_input - if mm_input.ndim != 3: - raise ValueError(f"{name} should be 2D or batched 3D tensor. " - f"Got ndim: {mm_input.ndim} " - f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) - else: - return torch.concat(mm_input) - - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[Glm4vImageInputs]: - pixel_values = kwargs.pop("pixel_values", None) - image_embeds = kwargs.pop("image_embeds", None) - image_grid_thw = kwargs.pop("image_grid_thw", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None: - pixel_values = self._validate_and_reshape_mm_tensor( - pixel_values, "image pixel values") - image_grid_thw = self._validate_and_reshape_mm_tensor( - image_grid_thw, "image grid_thw") - - return Glm4vImagePixelInputs( - type="pixel_values", - pixel_values=pixel_values, - image_grid_thw=image_grid_thw, - ) - - if image_embeds is not None: - image_embeds = self._validate_and_reshape_mm_tensor( - image_embeds, "image embeds") - image_grid_thw = self._validate_and_reshape_mm_tensor( - image_grid_thw, "image grid_thw") - - return Glm4vImageEmbeddingInputs( - type="image_embeds", - image_embeds=image_embeds, - image_grid_thw=image_grid_thw, - ) - - def _parse_and_validate_video_input( - self, **kwargs: object) -> Optional[Glm4vVideoInputs]: - pixel_values_videos = kwargs.pop("pixel_values_videos", None) - video_embeds = kwargs.pop("video_embeds", None) - video_grid_thw = kwargs.pop("video_grid_thw", None) - - if pixel_values_videos is None and video_embeds is None: - return None - - if pixel_values_videos is not None: - pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Glm4vVideoPixelInputs( - type="pixel_values_videos", - 
pixel_values_videos=pixel_values_videos, - video_grid_thw=video_grid_thw, - ) - - if video_embeds is not None: - video_embeds = self._validate_and_reshape_mm_tensor( - video_embeds, "video embeds") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Glm4vVideoEmbeddingInputs( - type="video_embeds", - video_embeds=video_embeds, - video_grid_thw=video_grid_thw, - ) - - def _process_image_input( - self, image_input: Glm4vImageInputs) -> tuple[torch.Tensor, ...]: - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input( - self, video_input: Glm4vVideoInputs) -> tuple[torch.Tensor, ...]: - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - device = self.visual.device - flat_grid_thw = torch.cat([ - torch.tensor([[1, h, w]] * t, device=device) - for t, h, w in grid_thw - ]) - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=flat_grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - - return video_embeds.split(sizes.tolist()) - - def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: - mm_input_by_modality = {} - - # Preserve the order of modalities if there are multiple of them - # from the order of kwargs. 
- for input_key in kwargs: - if (input_key in ("pixel_values", "image_embeds") - and "image" not in mm_input_by_modality): - mm_input_by_modality["image"] = ( - self._parse_and_validate_image_input(**kwargs)) - if (input_key in ("pixel_values_videos", "video_embeds") - and "video" not in mm_input_by_modality): - mm_input_by_modality["video"] = ( - self._parse_and_validate_video_input(**kwargs)) - return mm_input_by_modality - - def get_language_model(self) -> torch.nn.Module: - return self.language_model - - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - mm_input_by_modality = self._parse_and_validate_multimodal_inputs( - **kwargs) - if not mm_input_by_modality: - return None - - # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). - multimodal_embeddings: tuple[torch.Tensor, ...] = () - - # NOTE: It is important to iterate over the keys in this dictionary - # to preserve the order of the modalities. 
- for modality in mm_input_by_modality: - multimodal_input = mm_input_by_modality[modality] - if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) - multimodal_embeddings += vision_embeddings - if modality == "video": - video_embeddings = self._process_video_input(multimodal_input) - multimodal_embeddings += video_embeddings - return multimodal_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if (multimodal_embeddings is not None - and len(multimodal_embeddings) != 0 - and all(embed.numel() > 0 for embed in multimodal_embeddings)): - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - [self.config.image_token_id, self.config.video_token_id], - ) - return inputs_embeds - - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Glm4vImageInputs] = None, - video_input: Optional[Glm4vVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: object, - ) -> Union[torch.Tensor, IntermediateTensors]: - """Run forward pass for GLM-4V. 
- - Args: - input_ids: Flattened (concatenated) input_ids corresponding to a - batch. - positions: Flattened (concatenated) position ids corresponding to a - batch. - **NOTE**: If mrope is enabled (default setting for GLM-4V - opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. - second_per_grid_ts: Tensor `(num_videos)` of video time interval ( - in seconds) for each grid along the temporal dimension in the - 3D position IDs. `None` if no videos are passed. - """ - if intermediate_tensors is not None: - inputs_embeds = None - - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - - hidden_states = self.language_model.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="language_model.model", - connector="visual.merger.", - tower_model="visual.", - ) - - -@MULTIMODAL_REGISTRY.register_processor( - Glm4vMultiModalProcessor, - info=Glm4vProcessingInfo, - dummy_inputs=Glm4vDummyInputsBuilder, -) -class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } diff --git a/vllm_kunlun/models/glm4_moe.py b/vllm_kunlun/models/glm4_moe.py deleted file mode 100644 index cf658ae..0000000 --- a/vllm_kunlun/models/glm4_moe.py +++ /dev/null @@ -1,716 +0,0 @@ -# -# Copyright (c) 2025 
Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/glm4_moe.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only GLM-4.5 model compatible with HuggingFace weights.""" -import os -import typing -from collections.abc import Callable, Iterable -from itertools import islice -from typing import Any, Optional, Union - -import torch -from torch import nn -from transformers.models.glm4_moe import Glm4MoeConfig - -from vllm_kunlun.ops.attention.layer import Attention -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import (get_ep_group, get_pp_group,get_dp_group,get_tp_group, - get_tensor_model_parallel_world_size) -from vllm.logger import init_logger -from vllm_kunlun.ops.activation import SiluAndMul -from vllm_kunlun.ops.fused_moe.layer import FusedMoE -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) 
-from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP -from vllm.model_executor.models.utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) -from vllm_kunlun.ops.rotary_embedding import Split_Norm_Rope - -logger = init_logger(__name__) - - -class Glm4MoeMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - -class Glm4MoE(nn.Module): - - def __init__( - self, - config: Glm4MoeConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - enable_eplb: bool = False, - ): - super().__init__() - self.tp_size = get_tensor_model_parallel_world_size() - self.routed_scaling_factor = config.routed_scaling_factor - - self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() - self.ep_size = self.ep_group.size() - self.n_routed_experts: int = config.n_routed_experts - self.n_shared_experts: int = config.n_shared_experts - - if config.hidden_act != "silu": - raise ValueError(f"Unsupported activation: {config.hidden_act}. " - "Only silu is supported for now.") - # NOTE In the transformers implementation, the gate isn't an nn.Linear, - # so we cannot use ReplicatedLinear here. - # See: https://github.com/huggingface/transformers/blob/v4.55.1/src/transformers/models/glm4_moe/modeling_glm4_moe.py#L260 - self.gate = nn.Linear( - config.hidden_size, - config.n_routed_experts, - bias=False, - dtype=torch.float32, - ) - self.gate.e_score_correction_bias = nn.Parameter( - torch.empty(config.n_routed_experts, dtype=torch.float32)) - - # Load balancing settings. 
- vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config - self.enable_eplb = enable_eplb - - self.n_redundant_experts = parallel_config.num_redundant_experts - self.n_logical_experts = self.n_routed_experts - self.n_physical_experts = (self.n_logical_experts + - self.n_redundant_experts) - self.n_local_physical_experts = self.n_physical_experts // self.ep_size - - self.physical_expert_start = (self.ep_rank * - self.n_local_physical_experts) - self.physical_expert_end = (self.physical_expert_start + - self.n_local_physical_experts) - - self.experts = FusedMoE( - num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts", - scoring_func="sigmoid", - e_score_correction_bias=self.gate.e_score_correction_bias, - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts) - - if config.n_shared_experts is not None: - intermediate_size = (config.moe_intermediate_size * - config.n_shared_experts) - self.shared_experts = Glm4MoeMLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=self.experts.must_reduce_shared_expert_outputs( - ), - prefix=f"{prefix}.shared_experts", - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - - if self.n_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) - else: - shared_output = None - - router_logits = self.gate(hidden_states.to(dtype=torch.float32)) - kunlun_linear_weights = self.gate.weight - final_hidden_states = self.experts( - 
hidden_states=hidden_states, - router_logits=router_logits, - linear_weights=kunlun_linear_weights) * self.routed_scaling_factor - if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output - if self.tp_size > 1: - final_hidden_states = ( - self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states)) - return final_hidden_states.view(num_tokens, hidden_dim) - - -class Glm4MoeAttention(nn.Module): - - def __init__( - self, - config: Glm4MoeConfig, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, - max_position_embeddings: int = 131072, - head_dim: Optional[int] = None, - rms_norm_eps: float = 1e-05, - qkv_bias: bool = False, - use_qk_norm: bool = False, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or (hidden_size // self.total_num_heads) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - self.use_qk_norm = use_qk_norm - - self.qkv_proj = QKVParallelLinear(hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj") - - self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj") - - self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - partial_rotary_factor=self.partial_rotary_factor, - ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - - if self.use_qk_norm: - self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) - self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - - if os.getenv('USE_ORI_ROPE') == "1" or not self.use_qk_norm: - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if self.use_qk_norm: - q = self.q_norm(q.reshape(-1, self.num_heads, - self.head_dim)).reshape(q.shape) - k = self.k_norm(k.reshape(-1, self.num_kv_heads, - self.head_dim)).reshape(k.shape) - q, k = self.rotary_emb(positions, q, k) - else: - # Rope fusion operators - q, k, v = 
Split_Norm_Rope(qkv, - self.rotary_emb.cos_sin_cache, - self.q_norm.weight, - self.k_norm.weight, - positions, - self.max_position_embeddings, - self.num_heads, - self.num_kv_heads, - self.head_dim, - partial_rotary_factor=self.partial_rotary_factor, - ) - - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class Glm4MoeDecoderLayer(nn.Module): - - def __init__( - self, - config: Glm4MoeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - enable_eplb: bool = False, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 131072) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. - layer_idx = int(prefix.split(sep='.')[-1]) - self.layer_idx = layer_idx - - self.self_attn = Glm4MoeAttention( - config=config, - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - head_dim=config.head_dim, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=config.attention_bias, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - use_qk_norm=config.use_qk_norm, - ) - - if (config.n_routed_experts is not None - and layer_idx >= config.first_k_dense_replace): - self.mlp = Glm4MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - enable_eplb=enable_eplb, - ) - else: - self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - - self.input_layernorm = RMSNorm(config.hidden_size, 
- eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.routed_scaling_factor = config.routed_scaling_factor - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn(positions=positions, - hidden_states=hidden_states) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - "positions": -1, - "intermediate_tensors": 0, - "inputs_embeds": 0, - }) -class Glm4MoeModel(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - enable_eplb = vllm_config.parallel_config.enable_eplb - self.config = config - - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - prefix=f"{prefix}.embed_tokens") - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: Glm4MoeDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - enable_eplb=enable_eplb, - ), - prefix=f"{prefix}.layers") - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = ( - 
make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.n_routed_experts) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - 
stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - expert_params_mapping = self.get_expert_mapping() - for name, loaded_weight in weights: - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if (("mlp.experts." in name) and name not in params_dict): - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - is_expert_weight = False - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - - # Anyway, this is an expert weight and should not be - # attempted to load as other weights later - is_expert_weight = True - - # Do not modify `name` since the loop may continue here - # Instead, create a new variable - name_mapped = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name_mapped, self): - continue - - param = params_dict[name_mapped] - # We should ask the weight loader to return success or not - # here since otherwise we may skip experts with other - # available replicas. - weight_loader = typing.cast(Callable[..., bool], - param.weight_loader) - success = weight_loader(param, - loaded_weight, - name_mapped, - shard_id=shard_id, - expert_id=expert_id, - return_success=True) - if success: - name = name_mapped - break - else: - if is_expert_weight: - # We've checked that this is an expert weight - # However it's not mapped locally to this rank - # So we simply skip it - continue - - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - # Remapping the name of FP8 kv-scale. 
- name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params - - -class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = Glm4MoeModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - self.expert_weights = [] - - # Set MoE hyperparameters - self.num_moe_layers = (config.num_hidden_layers - - config.first_k_dense_replace) - self.num_expert_groups = config.n_group - - self.moe_layers: list[FusedMoE] = [] - example_moe = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - - assert isinstance(layer, Glm4MoeDecoderLayer) - if isinstance(layer.mlp, Glm4MoE): - # Pick last one layer since the first ones may be dense layers. 
- example_moe = layer.mlp - self.moe_layers.append(layer.mlp.experts) - - if example_moe is None: - raise RuntimeError("No Glm4MoE layer found in model.layers.") - - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return 
self.model.get_expert_mapping() - - -def get_spec_layer_idx_from_weight_name(config: Glm4MoeConfig, - weight_name: str) -> Optional[int]: - if hasattr(config, - "num_nextn_predict_layers") and (config.num_nextn_predict_layers - > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_nextn_predict_layers): - if f"layers.{layer_idx+i}." in weight_name: - return layer_idx + i - return None \ No newline at end of file diff --git a/vllm_kunlun/models/gpt_oss.py b/vllm_kunlun/models/gpt_oss.py index cc18587..2f5d9dd 100644 --- a/vllm_kunlun/models/gpt_oss.py +++ b/vllm_kunlun/models/gpt_oss.py @@ -1,21 +1,5 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/gpt_oss.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm_kunlun/models/interns1.py b/vllm_kunlun/models/interns1.py index 8fd23d8..76f5c2f 100644 --- a/vllm_kunlun/models/interns1.py +++ b/vllm_kunlun/models/interns1.py @@ -1,21 +1,11 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/interns1.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# -------------------------------------------------------- +# InternS1 +# Copyright (c) 2025 Shanghai AI Lab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union @@ -258,33 +248,39 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo] return image_token * num_images + video_token * num_videos +# def get_dummy_mm_data( +# self, +# seq_len: int, +# mm_counts: Mapping[str, int], +# ) -> MultiModalDataDict: +# target_width, target_height = \ +# self.info.get_image_size_with_most_features() +# target_num_frames = \ +# self.info.get_num_frames_with_most_features(seq_len, mm_counts) +# num_images = mm_counts.get("image", 0) +# num_videos = mm_counts.get("video", 0) + +# config = self.info.get_hf_config() +# image_size_h, image_size_w = config.vision_config.image_size + +# return { +# "image": +# self._get_dummy_images(width=target_width, +# height=target_height, +# num_images=num_images), +# "video": +# self._get_dummy_videos(width=image_size_w, +# height=image_size_h, +# num_frames=target_num_frames, +# num_videos=num_videos), +# } def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], ) -> 
MultiModalDataDict: - """Generates dummy multimodal data on Kunlun3 platform for performance analysis and warmup. - - Retrieves visual resolution based on configuration (defaulting to 224x224) - and generates resized dummy data for images and videos. - - Args: - seq_len: Sequence length (unused). - mm_counts: A mapping of multimodal type counts, containing "image" - and "video" keys. - - Returns: - MultiModalDataDict: A dictionary containing the generated dummy image - and video data, structured as: - { - "image": dummy_image_data, - "video": dummy_video_data - } - - Author: - Dong Xinyu - """ + # 读取配置里的视觉分辨率;若缺省则兜底 224×224 config = self.info.get_hf_config() img_size = getattr(config.vision_config, "image_size", None) if isinstance(img_size, (tuple, list)) and len(img_size) == 2: @@ -292,13 +288,15 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo] else: cfg_h, cfg_w = 224, 224 + # 统一缩减:不再使用 “with_most_features”,而是选择较小的安全尺寸 target_width = min(cfg_w, 224) target_height = min(cfg_h, 224) - target_num_frames = 1 + target_num_frames = 1 # profile/warmup 只造 1 帧即可 num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) + # 统一让视频也按缩减后的分辨率生成 return { "image": self._get_dummy_images( width=target_width, diff --git a/vllm_kunlun/models/interns1_vit.py b/vllm_kunlun/models/interns1_vit.py index bfdc6f9..28c6bc2 100644 --- a/vllm_kunlun/models/interns1_vit.py +++ b/vllm_kunlun/models/interns1_vit.py @@ -1,21 +1,12 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/interns1_vit.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- from collections.abc import Iterable from typing import Optional @@ -26,6 +17,7 @@ from transformers import PretrainedConfig from transformers.utils import torch_int from vllm.model_executor.layers.activation import get_act_fn +# from vllm_kunlun.ops.activation import GeluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -253,6 +245,7 @@ class InternS1VisionMLP(nn.Module): self.config = config self.activation_fn = get_act_fn(config.hidden_act) + # self.activation_fn = GeluAndMul() self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, diff --git a/vllm_kunlun/models/internvl.py b/vllm_kunlun/models/internvl.py index 81e186b..6ec825c 100644 --- a/vllm_kunlun/models/internvl.py +++ b/vllm_kunlun/models/internvl.py @@ -1,21 +1,12 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/internvl.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence from typing import Annotated, Any, Literal, Optional, TypeVar, Union diff --git a/vllm_kunlun/models/llama.py b/vllm_kunlun/models/llama.py index 9fabba7..222b962 100644 --- a/vllm_kunlun/models/llama.py +++ b/vllm_kunlun/models/llama.py @@ -1,9 +1,15 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/llama.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # -# This file is a part of the vllm-kunlun project. +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,8 +44,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) -from vllm_kunlun.ops.vocab_parallel_embedding import VocabParallelEmbedding + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata diff --git a/vllm_kunlun/models/qwen2.py b/vllm_kunlun/models/qwen2.py index c95b90d..21e56be 100644 --- a/vllm_kunlun/models/qwen2.py +++ b/vllm_kunlun/models/qwen2.py @@ -1,9 +1,16 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/qwen2.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py +# Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # -# This file is a part of the vllm-kunlun project. +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,7 +40,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm_kunlun.ops.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, +from vllm_kunlun.ops.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -44,7 +51,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm_kunlun.ops.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata +# from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.model_executor.models.adapters import as_seq_cls_model @@ -177,7 +184,12 @@ class Qwen2Attention(nn.Module): ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) + # INTERNVL_3暂时使用环境变量来控制是否使用原生rotary_embedding + # 若要修改,可尝试参考 qwen3.py + if os.getenv('INTERNVL_3') == "1": + q, k = self.rotary_emb.forward_native(positions, q, k) + else: + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) return output @@ -295,6 +307,7 @@ class Qwen2Model(nn.Module): )) self.config = config + config = config.get_text_config() self.quant_config = quant_config self.vocab_size = config.vocab_size @@ -479,10 +492,10 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - 
sampling_metadata: SamplingMetadata, + # sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states,) + # sampling_metadata) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm_kunlun/models/qwen2_5_vl.py b/vllm_kunlun/models/qwen2_5_vl.py index 7c0da7d..1fa7d2d 100644 --- a/vllm_kunlun/models/qwen2_5_vl.py +++ b/vllm_kunlun/models/qwen2_5_vl.py @@ -1,9 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. # -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/qwen2_5_vl.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-kunlun project. +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -24,26 +32,22 @@ from typing import Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange, repeat +from einops import rearrange from transformers import BatchFeature from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( - Qwen2_5_VLConfig, - Qwen2_5_VLVisionConfig, -) + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata +# from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -54,27 +58,14 @@ from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.model_executor.models.interfaces import ( - MultiModalEmbeddings, - SupportsLoRA, - SupportsMultiModal, - SupportsPP, - SupportsQuant, -) +from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP, SupportsQuant) from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder -from .qwen2_vl import ( - Qwen2VLMultiModalProcessor, - Qwen2VLProcessingInfo, - apply_rotary_pos_emb_vision, -) -from vllm.model_executor.models.utils import ( - AutoWeightsLoader, - WeightsMapper, - 
cast_overflow_tensors, - init_vllm_registered_model, - maybe_prefix, - merge_multimodal_embeddings, -) +from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, + apply_rotary_pos_emb_vision) +from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) from vllm.model_executor.models.vision import get_vit_attn_backend logger = init_logger(__name__) @@ -116,9 +107,8 @@ class Qwen2_5_VLImageEmbeddingInputs(TypedDict): """ -Qwen2_5_VLImageInputs = Union[ - Qwen2_5_VLImagePixelInputs, Qwen2_5_VLImageEmbeddingInputs -] +Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLImageEmbeddingInputs] class Qwen2_5_VLVideoPixelInputs(TypedDict): @@ -163,46 +153,37 @@ class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): """ -Qwen2_5_VLVideoInputs = Union[ - Qwen2_5_VLVideoPixelInputs, Qwen2_5_VLVideoEmbeddingInputs -] +Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs, + Qwen2_5_VLVideoEmbeddingInputs] # === Vision Encoder === # class Qwen2_5_VisionMLP(nn.Module): - def __init__( - self, - in_features: int, - hidden_features: int, - bias: bool = False, - act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, + in_features: int, + hidden_features: int, + bias: bool = False, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() - self.gate_proj = ColumnParallelLinear( - in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_proj", - ) - self.up_proj = ColumnParallelLinear( - in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.up_proj", - ) - self.down_proj = RowParallelLinear( - hidden_features, - in_features, - bias=bias, - 
quant_config=quant_config, - prefix=f"{prefix}.down_proj", - ) + self.gate_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj") + self.up_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") self.act_fn = act_fn def forward(self, x: torch.Tensor): @@ -216,14 +197,14 @@ class Qwen2_5_VisionMLP(nn.Module): def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int): """All-gather the input tensor interleavely across model parallel group.""" import torch.distributed as dist - gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] - dist.all_gather( - gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group - ) + dist.all_gather(gathered_tensors, + local_tensor, + group=parallel_state.get_tp_group().device_group) gathered_tensors_split = [ - torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors + torch.split(tensor, hidden_size // tp_size, -1) + for tensor in gathered_tensors ] ordered_tensors = [ tensor for pair in zip(*gathered_tensors_split) for tensor in pair @@ -233,8 +214,8 @@ def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int): class Qwen2_5_VisionAttention(nn.Module): - """ """ - + """ + """ def __init__( self, embed_dim: int, @@ -242,17 +223,20 @@ class Qwen2_5_VisionAttention(nn.Module): projection_size: int, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, + attn_backend: _Backend = _Backend.TORCH_SDPA, + use_upstream_fa: bool = False ) -> None: """ Initializes the Qwen2.5-VL module. - + Args: embed_dim (int): The embedding dimension for the input data. num_heads (int): The number of attention heads. 
projection_size (int): The size of the projection layer. quant_config (Optional[QuantizationConfig], optional): The quantization configuration. Defaults to None. prefix (str, optional): The prefix string for parameter names. Defaults to "". - + Raises: RuntimeError: If the attn backend is not supported. """ @@ -261,11 +245,9 @@ class Qwen2_5_VisionAttention(nn.Module): self.tp_size = parallel_state.get_tensor_model_parallel_world_size() self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads - ) + projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( - num_heads, self.tp_size - ) + num_heads, self.tp_size) self.qkv = QKVParallelLinear( hidden_size=embed_dim, @@ -274,21 +256,16 @@ class Qwen2_5_VisionAttention(nn.Module): total_num_kv_heads=num_heads, bias=True, quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) - self.proj = RowParallelLinear( - input_size=projection_size, - output_size=embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") # Detect attention implementation. self.attn_backend = _Backend.FLASH_ATTN if self.attn_backend not in { - _Backend.FLASH_ATTN, - _Backend.TORCH_SDPA, - _Backend.XFORMERS, + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS }: raise RuntimeError( f"Qwen2.5-VL does not support {self.attn_backend} backend now." 
@@ -298,37 +275,33 @@ class Qwen2_5_VisionAttention(nn.Module): # [s, b, 3 * head * head_dim] seq_len, bs, _ = qkv.shape if self.tp_size > 1: - qkv = all_gather_interleave(qkv, self.qkv.hidden_size, self.tp_size) + qkv = all_gather_interleave(qkv, self.qkv.hidden_size, + self.tp_size) # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] q, k, v = qkv.chunk(3, dim=2) # 3 * [s, b, head * head_dim] if self.tp_size > 1: - splitter = partial( - dist_utils.split_tensor_along_last_dim, num_partitions=self.tp_size - ) + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) q = splitter(q)[self.tp_rank] k = splitter(k)[self.tp_rank] v = splitter(v)[self.tp_rank] # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = ( - seq_len, - bs, - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ) + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) q, k, v = (x.view(*new_shape) for x in (q, k, v)) return q, k, v def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, - max_seqlen: Optional[int] = None, # Only used for Flash Attention - seqlens: Optional[list[int]] = None, # Only used for xFormers + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -337,7 +310,8 @@ class Qwen2_5_VisionAttention(nn.Module): q, k, v = self.split_qkv(x) batch_size = q.shape[1] - q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) + q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() + for x in (q, k, v)) if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) @@ -349,19 +323,19 @@ class Qwen2_5_VisionAttention(nn.Module): q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) - output = flash_attn_varlen_func( - q, - k, - v, - cu_seqlens_q=cu_seqlens, - cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen, - max_seqlen_k=max_seqlen, - dropout_p=0, - causal=False, - ) + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0, + causal=False) - context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size) + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] @@ -371,10 +345,12 @@ class Qwen2_5_VisionAttention(nn.Module): q_i = q[:, start_idx:end_idx] k_i = k[:, start_idx:end_idx] v_i = v[:, start_idx:end_idx] - q_i, k_i, v_i = ( - rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] - ) - output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) @@ -382,14 +358,14 @@ class Qwen2_5_VisionAttention(nn.Module): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None, + device=q.device) context_layer = xops.memory_efficient_attention_forward( - q, k, v, 
attn_bias=attn_bias, p=0, scale=None - ) - context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous() + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output @@ -412,37 +388,31 @@ class Qwen2_5_VisionBlock(nn.Module): norm_layer = partial(nn.LayerNorm, eps=1e-6) self.norm1 = norm_layer(dim) self.norm2 = norm_layer(dim) - self.attn = Qwen2_5_VisionAttention( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - self.mlp = Qwen2_5_VisionMLP( - dim, - mlp_hidden_dim, - act_fn=act_fn, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + self.mlp = Qwen2_5_VisionMLP(dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp") def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, - max_seqlen: Optional[int] = None, # Only used for Flash Attention - seqlens: Optional[list[int]] = None, # Only used for xFormers + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers ) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), - cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, - max_seqlen=max_seqlen, - seqlens=seqlens, - ) + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) x = x + self.mlp(self.norm2(x)) return x @@ -463,17 +433,16 @@ class Qwen2_5_VisionPatchEmbed(nn.Module): self.hidden_size = hidden_size kernel_size = (temporal_patch_size, 
patch_size, patch_size) - self.proj = nn.Conv3d( - in_channels, - hidden_size, - kernel_size=kernel_size, - stride=kernel_size, - bias=False, - ) + self.proj = nn.Conv3d(in_channels, + hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False) def forward(self, x: torch.Tensor) -> torch.Tensor: L, C = x.shape - x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) x = self.proj(x).view(L, self.hidden_size) return x @@ -494,25 +463,19 @@ class Qwen2_5_VisionPatchMerger(nn.Module): if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) self.ln_q = norm_layer(context_dim) - self.mlp = nn.ModuleList( - [ - ColumnParallelLinear( - self.hidden_size, - self.hidden_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.mlp.0", - ), - nn.GELU(), - RowParallelLinear( - self.hidden_size, - d_model, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.mlp.2", - ), - ] - ) + self.mlp = nn.ModuleList([ + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.0"), + nn.GELU(), + RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.2"), + ]) def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.ln_q(x) @@ -529,19 +492,20 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module): def __init__(self, dim: int, theta: float = 10000.0) -> None: """ - Initialization function to create an instance. - + 初始化函数,用于创建一个实例。 + Args: - dim (int): Dimension size, representing the length of the input feature vector. - theta (float, optional, default=10000.0): Parameter that controls the smoothness of the frequency distribution, default is 10000.0. 
- + dim (int): 维度大小,表示输入的特征向量长度。 + theta (float, optional, default=10000.0): 参数,控制频谱分布的平滑程度,默认为10000.0。 + Returns: - None: Does not return any value, directly creates an instance. + None: 不返回任何值,直接创建一个实例。 """ super().__init__() self.dim = dim self.theta = theta - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + inv_freq = 1.0 / (theta**( + torch.arange(0, dim, 2, dtype=torch.float) / dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) self._seq_len_cached = 0 self._freqs_cached = None @@ -550,18 +514,12 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module): if seqlen > self._seq_len_cached: seqlen *= 2 self._seq_len_cached = seqlen - self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) + self.inv_freq = 1.0 / (self.theta**(torch.arange( + 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim)) + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) freqs = torch.outer(seq, self.inv_freq) self._freqs_cached = freqs @@ -580,14 +538,14 @@ class Qwen2_5_VisionTransformer(nn.Module): prefix: str = "", ) -> None: """ - Args: - vision_config (Qwen2_5_VLVisionConfig): config of the Vision Transformer model. - norm_eps (float, optional, default=1e-6): Epsilon added to the norm computation. - Defaults to 1e-6. - quant_config (Optional[QuantizationConfig], optional): Config for post-training quantization. - Defaults to None. - prefix (str, optional): Prefix string for module names. - Defaults to "". + Args: + vision_config (Qwen2_5_VLVisionConfig): config of the Vision Transformer model. + norm_eps (float, optional, default=1e-6): Epsilon added to the norm computation. + Defaults to 1e-6. + quant_config (Optional[QuantizationConfig], optional): Config for post-training quantization. 
+ Defaults to None. + prefix (str, optional): Prefix string for module names. + Defaults to "". """ super().__init__() @@ -616,20 +574,17 @@ class Qwen2_5_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) - self.blocks = nn.ModuleList( - [ - Qwen2_5_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}", - ) - for layer_idx in range(depth) - ] - ) + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) self.merger = Qwen2_5_VisionPatchMerger( d_model=vision_config.out_hidden_size, context_dim=self.hidden_size, @@ -651,66 +606,48 @@ class Qwen2_5_VisionTransformer(nn.Module): def rotary_pos_emb_thw(self, t, h, w): hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = ( - hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ) - .permute(0, 2, 1, 3) - .flatten() - ) - wpos_ids = ( - wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ) - .permute(0, 2, 1, 3) - .flatten() - ) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // 
self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) max_size = max(h, w) rotary_pos_emb_full = self.rotary_pos_emb(max_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) rotary_pos_emb = rotary_pos_emb.reshape( rotary_pos_emb.shape[0] // self.spatial_merge_unit, - self.spatial_merge_unit, - -1, - ) + self.spatial_merge_unit, -1) return rotary_pos_emb def get_window_index_thw(self, grid_t, grid_h, grid_w): - vit_merger_window_size = ( - self.window_size // self.spatial_merge_size // self.patch_size - ) + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) llm_grid_h = grid_h // self.spatial_merge_size llm_grid_w = grid_w // self.spatial_merge_size index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w - ) + grid_t, llm_grid_h, llm_grid_w) pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100) - index_padded = index_padded.reshape( - grid_t, - num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size, - ) + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, - num_windows_h * num_windows_w, - vit_merger_window_size, - vit_merger_window_size, - ) + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) index_padded = index_padded.reshape(-1) index_new = index_padded[index_padded 
!= -100] @@ -722,19 +659,15 @@ class Qwen2_5_VisionTransformer(nn.Module): @lru_cache(maxsize=1024) # noqa: B019 def get_rope_by_thw(self, t, h, w): - window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw(t, h, w) + window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw( + t, h, w) rotary_pos_emb_thw = self.rotary_pos_emb_thw(t, h, w) rotary_pos_emb_thw = rotary_pos_emb_thw[window_index_thw, :, :] rotary_pos_emb_thw = rotary_pos_emb_thw.flatten(start_dim=0, end_dim=1) cu_seqlens_thw = torch.repeat_interleave( - torch.tensor([h * w], dtype=torch.int32), t - ) - return ( - rotary_pos_emb_thw, - window_index_thw, - cu_seqlens_window_thw, - cu_seqlens_thw, - ) + torch.tensor([h * w], dtype=torch.int32), t) + return (rotary_pos_emb_thw, window_index_thw, cu_seqlens_window_thw, + cu_seqlens_thw) def compute_attn_mask_seqlen( self, @@ -777,9 +710,10 @@ class Qwen2_5_VisionTransformer(nn.Module): ) = self.get_rope_by_thw(t, h, w) window_index.append(window_index_thw + window_index_id) - window_index_id += t * llm_h * llm_w + window_index_id += (t * llm_h * llm_w) - cu_seqlens_window_thw = cu_seqlens_window_thw + cu_window_seqlens_last + cu_seqlens_window_thw = (cu_seqlens_window_thw + + cu_window_seqlens_last) cu_window_seqlens_last = cu_seqlens_window_thw[-1] cu_window_seqlens.append(cu_seqlens_window_thw) @@ -797,36 +731,21 @@ class Qwen2_5_VisionTransformer(nn.Module): # transformers # pre-compute seqlens for window/full attn to reduce cuMemcpy operations - max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(cu_seqlens) + max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen( + cu_seqlens) max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen( - cu_window_seqlens - ) + cu_window_seqlens) cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) - cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True) - rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) - - # 
Construct rotary_pos_emb_cos_sin_cache to use rope_vit operator - rotary_pos_emb_cos = rotary_pos_emb.cos() - rotary_pos_emb_sin = rotary_pos_emb.sin() - interleaved = False - rotary_pos_emb_cos = repeat( - rotary_pos_emb_cos, - "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)", - ) # shape: [seq_len, 1, head_dim] - rotary_pos_emb_sin = repeat( - rotary_pos_emb_sin, - "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)", - ) # shape: [seq_len, 1, head_dim] - rotary_pos_emb_cos_sin_cache = torch.cat( - [rotary_pos_emb_cos, rotary_pos_emb_sin], dim=1 - ) # shape: [seq_len, 2, head_dim] - - window_index = window_index.to(device=hidden_states.device, non_blocking=True) + cu_window_seqlens = cu_window_seqlens.to(device=self.device, + non_blocking=True) + rotary_pos_emb = rotary_pos_emb.to(device=self.device, + non_blocking=True) + window_index = window_index.to(device=hidden_states.device, + non_blocking=True) hidden_states = hidden_states.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1 - ) + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) hidden_states = hidden_states[window_index, :, :] hidden_states = hidden_states.reshape(seq_len, -1) @@ -845,7 +764,7 @@ class Qwen2_5_VisionTransformer(nn.Module): hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens_now, - rotary_pos_emb=rotary_pos_emb_cos_sin_cache, + rotary_pos_emb=rotary_pos_emb, max_seqlen=max_seqlen_now, seqlens=seqlens_now, ) @@ -861,7 +780,8 @@ class Qwen2_5_VisionTransformer(nn.Module): hidden_states = hidden_states[reverse_indices, :] return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("attn.qkv.", "attn.q.", "q"), @@ -872,7 +792,7 @@ class Qwen2_5_VisionTransformer(nn.Module): loaded_params: set[str] = set() for name, 
loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -883,7 +803,8 @@ class Qwen2_5_VisionTransformer(nn.Module): break else: param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params @@ -908,12 +829,11 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): return self.ctx.get_hf_processor( Qwen2_5_VLProcessor, - image_processor=self.get_image_processor( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get("use_fast", True), - ), + image_processor=self.get_image_processor(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + use_fast=kwargs.get( + "use_fast", True)), **kwargs, ) @@ -934,11 +854,10 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): @MULTIMODAL_REGISTRY.register_processor( Qwen2_5_VLMultiModalProcessor, info=Qwen2_5_VLProcessingInfo, - dummy_inputs=Qwen2_5_VLDummyInputsBuilder, -) -class Qwen2_5_VLForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsQuant -): + dummy_inputs=Qwen2_5_VLDummyInputsBuilder) +class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP, + SupportsQuant): # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper( @@ -949,8 +868,7 @@ class Qwen2_5_VLForConditionalGeneration( # mapping for original checkpoint "lm_head.": "language_model.lm_head.", "model.": "language_model.model.", - } - ) + }) @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: @@ -983,37 +901,31 @@ class Qwen2_5_VLForConditionalGeneration( ) self.make_empty_intermediate_tensors = ( - self.language_model.make_empty_intermediate_tensors - ) + self.language_model.make_empty_intermediate_tensors) def _maybe_ignore_quant_config(self, config: Optional[QuantizationConfig]): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ # seems to avoid vision encoder sections for some models. return config - def _validate_and_reshape_mm_tensor( - self, mm_input: object, name: str - ) -> torch.Tensor: + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of {name}. " f"Got type: {type(mm_input)}" - ) + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") if isinstance(mm_input, torch.Tensor): if mm_input.ndim == 2: return mm_input if mm_input.ndim != 3: - raise ValueError( - f"{name} should be 2D or batched 3D tensor. " - f"Got ndim: {mm_input.ndim} " - f"(shape={mm_input.shape})" - ) + raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) else: return torch.concat(mm_input) def _parse_and_validate_image_input( - self, **kwargs: object - ) -> Optional[Qwen2_5_VLImageInputs]: + self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) image_grid_thw = kwargs.pop("image_grid_thw", None) @@ -1023,46 +935,34 @@ class Qwen2_5_VLForConditionalGeneration( if pixel_values is not None: pixel_values = self._validate_and_reshape_mm_tensor( - pixel_values, "image pixel values" - ) + pixel_values, "image pixel values") image_grid_thw = self._validate_and_reshape_mm_tensor( - image_grid_thw, "image grid_thw" - ) + image_grid_thw, "image grid_thw") if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - "Incorrect type of image pixel values. " - f"Got type: {type(pixel_values)}" - ) + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") - return Qwen2_5_VLImagePixelInputs( - type="pixel_values", - pixel_values=pixel_values, - image_grid_thw=image_grid_thw, - ) + return Qwen2_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) if image_embeds is not None: image_embeds = self._validate_and_reshape_mm_tensor( - image_embeds, "image embeds" - ) + image_embeds, "image embeds") image_grid_thw = self._validate_and_reshape_mm_tensor( - image_grid_thw, "image grid_thw" - ) + image_grid_thw, "image grid_thw") if not isinstance(image_embeds, torch.Tensor): - raise ValueError( - "Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}" - ) + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") return Qwen2_5_VLImageEmbeddingInputs( type="image_embeds", image_embeds=image_embeds, - image_grid_thw=image_grid_thw, - ) + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( - self, **kwargs: object - ) -> Optional[Qwen2_5_VLVideoInputs]: + self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) video_embeds = kwargs.pop("video_embeds", None) video_grid_thw = kwargs.pop("video_grid_thw", None) @@ -1073,11 +973,9 @@ class Qwen2_5_VLForConditionalGeneration( if pixel_values_videos is not None: pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values" - ) + pixel_values_videos, "video pixel values") video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw" - ) + video_grid_thw, "video grid_thw") return Qwen2_5_VLVideoPixelInputs( type="pixel_values_videos", @@ -1088,41 +986,36 @@ class Qwen2_5_VLForConditionalGeneration( if video_embeds is not None: video_embeds = self._validate_and_reshape_mm_tensor( - video_embeds, "video embeds" - ) + video_embeds, "video embeds") video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw" - ) + video_grid_thw, "video grid_thw") if not isinstance(video_embeds, torch.Tensor): - raise ValueError( - "Incorrect type of video embeddings. " - f"Got type: {type(video_embeds)}" - ) + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") return Qwen2_5_VLVideoEmbeddingInputs( type="video_embeds", video_embeds=video_embeds, - video_grid_thw=video_grid_thw, - ) + video_grid_thw=video_grid_thw) def _process_image_input( - self, image_input: Qwen2_5_VLImageInputs - ) -> tuple[torch.Tensor, ...]: + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: """ - Process image input and return tensors for each image item. 
- If the input is image embeddings, return the image embeddings; otherwise, return the tensors processed by the visual model. - + 处理图像输入,返回每个图像项的张量。 + 如果输入是图像嵌入,则返回图像嵌入;否则返回经过视觉模型处理后的张量。 + Args: - image_input (Qwen2_5_VLImageInputs): A dictionary containing image information, including the following key-value pairs: - - type (str, optional): The type of the image, which can be "image_embeds" or None (default), indicating the use of image embeddings; - If None, the "pixel_values" and "image_grid_thw" key-value pairs must be provided. - - pixel_values (torch.Tensor, optional): The pixel values of the image, with a shape of (batch_size, num_channels, height, width), - dtype is float32, optional, and must be provided only when type is None. - - image_grid_thw (torch.Tensor, optional): The image grid size, with a shape of (batch_size, 2) and dtype int64, - optional, and must be provided only when type is None. - + image_input (Qwen2_5_VLImageInputs): 包含图像信息的字典,其中包括以下键值对: + - type (str, optional): 图像类型,可选值为"image_embeds"或者None(默认),表示使用图像嵌入; + None时,需要提供"pixel_values"和"image_grid_thw"键值对。 + - pixel_values (torch.Tensor, optional): 图像像素值,shape为(batch_size, num_channels, height, width), + dtype为float32,optional,只有当type为None时才需要提供。 + - image_grid_thw (torch.Tensor, optional): 图像网格大小,shape为(batch_size, 2),dtype为int64, + optional,只有当type为None时才需要提供。 + Returns: - tuple (torch.Tensor, ...): A tuple containing the tensors for each image item, with the number of tensors equal to the vertical product of the image grid size. + tuple (torch.Tensor, ...): 一个元组,包含每个图像项的张量,张量数量等于图像网格大小的纵向乘积。 """ grid_thw = image_input["image_grid_thw"] @@ -1137,28 +1030,26 @@ class Qwen2_5_VLForConditionalGeneration( # Split concatenated embeddings for each image item. 
merge_size = self.visual.spatial_merge_size - sizes = (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2]) // ( - merge_size * merge_size - ) + sizes = (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2]) // (merge_size * merge_size) return image_embeds.split(sizes.tolist()) def _process_video_input( - self, video_input: Qwen2_5_VLVideoInputs - ) -> tuple[torch.Tensor, ...]: + self, + video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: """ - Process video input and return tensors for each video item. - If the input is video embeddings, return the video embeddings; otherwise, return the tensors processed by the visual model. - + 处理视频输入,返回每个视频项的张量。 + 如果是视频嵌入,则直接返回;否则,使用视觉模型提取视频嵌入。 + Args: - video_input (Qwen2_5_VLVideoInputs): A dictionary containing video information, including the following key-value pairs: - - type (str): The type of the video, which can be "video_embeds" or "pixel_values_videos". - - video_grid_thw (torch.Tensor): The video grid size, with a shape of (N, 2), where N is the number of video items, and the last two dimensions represent height and width. - - video_embeds (Optional[torch.Tensor]): The video embeddings, which are only valid when type is "video_embeds". - - pixel_values_videos (Optional[torch.Tensor]): The pixel values of the videos, which are only valid when type is "pixel_values_videos". - + video_input (Qwen2_5_VLVideoInputs): 包含视频数据的字典,包括: + - type (str): 类型,可选值为"video_embeds"或"pixel_values_videos"。 + - video_grid_thw (torch.Tensor): 视频网格大小,形状为(N, 2),其中N是视频项数,第一维度表示视频项索引,最后两个维度分别代表高和宽。 + - video_embeds (Optional[torch.Tensor]): 视频嵌入,仅当type为"video_embeds"时有效。 + - pixel_values_videos (Optional[torch.Tensor]): 像素值视频,仅当type为"pixel_values_videos"时有效。 + Returns: - tuple[torch.Tensor, ...]: A tuple containing the tensors for each video item, with the number of tensors equal to N. 
+ tuple[torch.Tensor, ...]: 元组,包含每个视频项的张量,长度为N。 """ grid_thw = video_input["video_grid_thw"] @@ -1169,11 +1060,12 @@ class Qwen2_5_VLForConditionalGeneration( video_embeds = video_input["video_embeds"] else: pixel_values_videos = video_input["pixel_values_videos"] - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list) + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(grid_thw.dim() - 1) // merge_size // merge_size + sizes = grid_thw.prod(grid_thw.dim() - 1) // merge_size // merge_size return video_embeds.split(sizes.tolist()) @@ -1183,28 +1075,24 @@ class Qwen2_5_VLForConditionalGeneration( # Preserve the order of modalities if there are multiple of them # from the order of kwargs. for input_key in kwargs: - if ( - input_key in ("pixel_values", "image_embeds") - and "image" not in mm_input_by_modality - ): - mm_input_by_modality["image"] = self._parse_and_validate_image_input( - **kwargs - ) - if ( - input_key in ("pixel_values_videos", "video_embeds") - and "video" not in mm_input_by_modality - ): - mm_input_by_modality["video"] = self._parse_and_validate_video_input( - **kwargs - ) + if input_key in ("pixel_values", "image_embeds" + ) and "image" not in mm_input_by_modality: + mm_input_by_modality[ + "image"] = self._parse_and_validate_image_input(**kwargs) + if input_key in ("pixel_values_videos", "video_embeds" + ) and "video" not in mm_input_by_modality: + mm_input_by_modality[ + "video"] = self._parse_and_validate_video_input(**kwargs) return mm_input_by_modality def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: - mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) + 
mm_input_by_modality = self._parse_and_validate_multimodal_inputs( + **kwargs) if not mm_input_by_modality: return [] @@ -1230,13 +1118,11 @@ class Qwen2_5_VLForConditionalGeneration( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - [self.config.image_token_id, self.config.video_token_id], - ) + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) return inputs_embeds def get_input_embeddings_v0( @@ -1271,7 +1157,7 @@ class Qwen2_5_VLForConditionalGeneration( positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - kv_caches: list[torch.Tensor] = None, + kv_caches: list[torch.Tensor]= None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for Qwen2.5-VL. 
@@ -1313,11 +1199,11 @@ class Qwen2_5_VLForConditionalGeneration( if uses_mrope(self.config): assert positions.ndim == 2 and positions.size(0) == 3, ( "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}" - ) + f"(3, seq_len) positions, but got {positions.size()}") inputs_embeds = self.get_input_embeddings_v0( - input_ids, image_input=image_input, video_input=video_input - ) + input_ids, + image_input=image_input, + video_input=video_input) input_ids = None hidden_states = self.language_model.model( @@ -1331,11 +1217,13 @@ class Qwen2_5_VLForConditionalGeneration( def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + # sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, sampling_metadata) + return self.language_model.compute_logits(hidden_states) + # sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) @@ -1348,4 +1236,4 @@ class Qwen2_5_VLForConditionalGeneration( language_model="language_model", connector="visual.merger.", tower_model="visual.", - ) + ) \ No newline at end of file diff --git a/vllm_kunlun/models/qwen2_vl.py b/vllm_kunlun/models/qwen2_vl.py index 6402f55..18f854c 100644 --- a/vllm_kunlun/models/qwen2_vl.py +++ b/vllm_kunlun/models/qwen2_vl.py @@ -1,9 +1,16 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. 
-# Adapted from vllm/model_executor/models/qwen2vl.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +# Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # -# This file is a part of the vllm-kunlun project. +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,7 +45,7 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata +# from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -70,11 +77,12 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) from vllm.model_executor.models.vision import get_vit_attn_backend +import xspeedgate_ops logger = init_logger(__name__) # For profile run -_MAX_FRAMES_PER_VIDEO = 16 +_MAX_FRAMES_PER_VIDEO = 14 # === Vision Inputs === # @@ -226,13 +234,10 @@ def apply_rotary_emb_torch(x: torch.Tensor, def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: t_ = t.float() - + if freqs.dim() 
== 3 and freqs.shape[1] == 2: - # freqs: (seq_len, 2, head_dim) - # Call custom XPU Kernel version - import xspeedgate_ops return torch.ops.xspeedgate_ops.rope_vit(t_, freqs, interleaved = False).type_as(t) - + cos = freqs.cos() sin = freqs.sin() apply_rotary_emb = apply_rotary_emb_torch @@ -922,10 +927,10 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): image_processor=None, ) - def _get_max_video_frames(self, max_tokens: int) -> int: + def _get_max_video_frames(self, max_tokens: int, start_num_frames: int = 1) -> int: target_width, target_height = self.get_image_size_with_most_features() - num_frames = 0 + num_frames = start_num_frames while True: next_num_frames = num_frames + 1 @@ -947,15 +952,23 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): self, seq_len: int, mm_counts: Mapping[str, int], + max_frames_per_video: int = _MAX_FRAMES_PER_VIDEO, ) -> int: - max_images = mm_counts.get("image", 0) + # max_images = mm_counts.get("image", 0) + # max_videos = mm_counts.get("video", 0) + + # max_image_tokens = self.get_max_image_tokens() * max_images + # max_total_frames = self._get_max_video_frames(seq_len - + # max_image_tokens) + # max_frames_per_video = min(max_total_frames // max(max_videos, 1), + # _MAX_FRAMES_PER_VIDEO) + + # return max(max_frames_per_video, 1) max_videos = mm_counts.get("video", 0) - max_image_tokens = self.get_max_image_tokens() * max_images - max_total_frames = self._get_max_video_frames(seq_len - - max_image_tokens) + max_total_frames = self._get_max_video_frames(seq_len) max_frames_per_video = min(max_total_frames // max(max_videos, 1), - _MAX_FRAMES_PER_VIDEO) + max_frames_per_video) return max(max_frames_per_video, 1) @@ -1404,10 +1417,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + # sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - 
sampling_metadata) + return self.language_model.compute_logits(hidden_states) + # sampling_metadata) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -1507,4 +1520,4 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration): torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) \ No newline at end of file diff --git a/vllm_kunlun/models/qwen3.py b/vllm_kunlun/models/qwen3.py index 308c05a..7a21ad5 100644 --- a/vllm_kunlun/models/qwen3.py +++ b/vllm_kunlun/models/qwen3.py @@ -1,9 +1,14 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/qwen3.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # -# This file is a part of the vllm-kunlun project. +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,65 +23,57 @@ # limitations under the License. 
"""Inference-only Qwen3 model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Optional, Union -import xtorch_ops +from typing import Any, Optional, Union + import torch -import os from torch import nn from transformers import Qwen3Config -from vllm.attention import AttentionType, AttentionMetadata +from vllm.attention import AttentionType from vllm_kunlun.ops.attention.layer import Attention + from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (QKVParallelLinear, + +from vllm_kunlun.ops.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm_kunlun.ops.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm import envs - -from vllm.model_executor.models.adapters import as_seq_cls_model -from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP +from vllm.model_executor.models.interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP +from .qwen2 import Qwen2Model from vllm.model_executor.models.utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, - is_pp_missing_parameter, - 
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.forward_context import ForwardContext, get_forward_context -from vllm.platforms import current_platform -from vllm_kunlun.ops.rotary_embedding import Split_Norm_Rope - logger = init_logger(__name__) class Qwen3Attention(nn.Module): - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - head_dim: Optional[int] = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - rope_theta: float = 10000, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[tuple] = None, - prefix: str = "", - attn_type: str = AttentionType.DECODER) -> None: + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + head_dim: Optional[int] = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + rope_scaling: Optional[tuple] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -98,10 +95,7 @@ class Qwen3Attention(nn.Module): self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.max_position = max_position - if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] - self.max_position = int(self.max_position * scaling_factor) + self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( hidden_size, @@ -123,18 +117,25 @@ class Qwen3Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, - max_position=self.max_position, + 
max_position=max_position, base=self.rope_theta, rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + attn_type=attn_type, + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": dual_chunk_attention_config, + } if dual_chunk_attention_config else {}, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=attn_type) self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -142,35 +143,19 @@ class Qwen3Attention(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor], ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) - # TODO: Supports both original Rope and Kunlun Rope fusion operators - if os.getenv('FUSED_QK_ROPE_OP') == "1": - # Rope fusion operators - q, k, v = Split_Norm_Rope(qkv, - self.rotary_emb.cos_sin_cache, - self.q_norm.weight, - self.k_norm.weight, - positions, - self.max_position, - self.num_heads, - self.num_kv_heads, - self.head_dim, - ) - else: - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - # Add qk-norm - q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, - self.head_dim) - q_by_head = self.q_norm(q_by_head) - q = q_by_head.view(q.shape) - k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, - self.head_dim) - k_by_head = self.k_norm(k_by_head) - k = k_by_head.view(k.shape) - q, k = self.rotary_emb(positions, q, k) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + # Add qk-norm + q_by_head = q.view(*q.shape[:-1], 
q.shape[-1] // self.head_dim, + self.head_dim) + q_by_head = self.q_norm(q_by_head) + q = q_by_head.view(q.shape) + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, + self.head_dim) + k_by_head = self.k_norm(k_by_head) + k = k_by_head.view(k.shape) + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) return output @@ -190,6 +175,9 @@ class Qwen3DecoderLayer(nn.Module): # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) + dual_chunk_attention_config = getattr(config, + "dual_chunk_attention_config", + None) # By default, Qwen3 uses causal attention as it is a decoder-only model. # You can override the HF config with `is_causal=False` to enable @@ -214,6 +202,7 @@ class Qwen3DecoderLayer(nn.Module): rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config, ) self.mlp = Qwen3MLP( hidden_size=self.hidden_size, @@ -231,7 +220,6 @@ class Qwen3DecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention @@ -244,8 +232,6 @@ class Qwen3DecoderLayer(nn.Module): hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, - attn_metadata=attn_metadata, - residual=residual, ) # Fully Connected @@ -259,6 +245,7 @@ ALL_DECODER_LAYER_TYPES = { "attention": Qwen3DecoderLayer, } + @support_torch_compile( dynamic_arg_dims={ "input_ids": 0, @@ -268,189 +255,15 @@ ALL_DECODER_LAYER_TYPES = { "intermediate_tensors": 0, "inputs_embeds": 0, }) -class Qwen3Model(nn.Module): - """Qwen3Model""" - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = "", - decoder_layer_type: type[nn.Module] = Qwen3DecoderLayer): - super().__init__() +class Qwen3Model(Qwen2Model): - config = 
vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + decoder_layer_type=Qwen3DecoderLayer) - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - assert config.max_window_layers == config.num_hidden_layers, ( - "Sliding window for some but all layers is not supported. " - "This model uses sliding window but `max_window_layers` = {} " - "is less than `num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) - self.config = config - self.quant_config = quant_config - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank or (config.tie_word_embeddings - and get_pp_group().is_last_rank): - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.embed_tokens", - ) - else: - self.embed_tokens = PPMissingLayer() - - # Use the provided decoder layer type or default to Qwen2DecoderLayer - decoder_layer_type = decoder_layer_type or Qwen3DecoderLayer - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: decoder_layer_type(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), - prefix=f"{prefix}.layers", - ) - - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - """get_input_embeddings""" - return 
self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - """ - Args: - input_ids (torch.Tensor): Input sequence of shape `(batch, seq_len)`. - Indices are expected to be in the range `[0, config.vocab_size]`. - positions (torch.Tensor): Positional tensor of shape `(batch, seq_len)`. - intermediate_tensors (Optional[IntermediateTensors], optional): - Intermediate tensors from previous forward pass. Defaults to `None`. - inputs_embeds (Optional[torch.Tensor], optional): - Optionally, instead of positional embeddings, you can choose to - provide your own embedding lookup matrix of shape `(batch, seq_len, emb_dim)`. - If None, the model will create one on its own using the input ids. - Defaults to `None`. - - Returns: - Union[torch.Tensor, IntermediateTensors]: - If `intermediate_tensors` is not None, returns a IntermediateTensors object. - Otherwise, returns a tensor of shape `(batch, seq_len, hidden_size)` representing - the output of the last transformer encoder layer. 
- """ - forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - for i, layer in enumerate(self.layers[self.start_layer:self.end_layer], start=self.start_layer): - hidden_states, residual = layer( - positions, - hidden_states, - attn_metadata, - residual, - ) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - """Load model weights. - Args: - weights (Iterable[tuple[str, torch.Tensor]]): An iterator containing weight names and their corresponding values. - Returns (set[str]): - A set of already loaded weight names. - Exceptions: - None. 
- """ - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if (self.quant_config is not None and - (scale_name := self.quant_config.get_cache_scale(name))): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else - loaded_weight[0]) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. 
- name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - -class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -493,6 +306,13 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -502,7 +322,6 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - kv_caches: list[torch.Tensor] = None ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) @@ -511,10 +330,8 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, @@ -525,6 +342,3 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if self.config.tie_word_embeddings 
else None), ) return loader.load_weights(weights) - - -Qwen3ForSequenceClassification = as_seq_cls_model(Qwen3ForCausalLM) diff --git a/vllm_kunlun/models/qwen3_moe.py b/vllm_kunlun/models/qwen3_moe.py index 8250639..9f75c47 100644 --- a/vllm_kunlun/models/qwen3_moe.py +++ b/vllm_kunlun/models/qwen3_moe.py @@ -1,9 +1,14 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Adapted from vllm/model_executor/models/qwen3_moe.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # -# This file is a part of the vllm-kunlun project. +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,49 +22,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen3MoE model compatible with HuggingFace weights.""" -import os -from collections.abc import Iterable -from typing import Any, Optional, Union, Tuple, Set +import typing +from collections.abc import Callable, Iterable +from itertools import islice +from typing import Any, Optional, Union import torch -import os from torch import nn -from transformers import PretrainedConfig from vllm_kunlun.ops.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import (get_ep_group, get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) from vllm.logger import init_logger from vllm_kunlun.ops.activation import SiluAndMul from vllm_kunlun.ops.fused_moe.layer import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) -from vllm_kunlun.ops.linear import ReplicatedLinear +from vllm_kunlun.ops.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, + ReplicatedLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead) from vllm_kunlun.ops.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata +from 
vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.utils import sequence_parallel_chunk from vllm.sequence import IntermediateTensors - -from vllm.model_executor.models.interfaces import SupportsPP -from vllm.model_executor.models.utils import ( - AutoWeightsLoader, - extract_layer_index, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) from vllm_kunlun.ops.rotary_embedding import Split_Norm_Rope +from vllm.model_executor.models.interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from vllm.model_executor.models.utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + logger = init_logger(__name__) @@ -76,25 +78,19 @@ class Qwen3MoeMLP(nn.Module): ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, - [intermediate_size] * 2, + hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj", - ) - self.down_proj = RowParallelLinear( - intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj", - ) + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": - raise ValueError( - f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now." - ) + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") self.act_fn = SiluAndMul() def forward(self, x): @@ -108,66 +104,88 @@ class Qwen3MoeSparseMoeBlock(nn.Module): def __init__( self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ): super().__init__() + + config = vllm_config.model_config.hf_text_config + parallel_config = vllm_config.parallel_config + quant_config = vllm_config.quant_config + self.tp_size = get_tensor_model_parallel_world_size() + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.num_experts + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + if self.tp_size > config.num_experts: raise ValueError( f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {config.num_experts}." - ) + f"the number of experts {config.num_experts}.") - self.experts = FusedMoE( - num_experts=config.num_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - prefix=f"{prefix}.experts", - ) - self.quant_config = quant_config - self.gate = ReplicatedLinear( - config.hidden_size, - config.num_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate", - ) + # Load balancing settings. 
+ vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + + self.experts = FusedMoE(num_experts=self.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=True, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel) + + self.gate = ReplicatedLinear(config.hidden_size, + config.num_experts, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # NOTE: hidden_states can have either 1D or 2D shape. 
- orig_shape = hidden_states.shape - hidden_dim = hidden_states.shape[-1] + assert hidden_states.dim( + ) <= 2, "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs" + is_input_1d = hidden_states.dim() == 1 + num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - if self.quant_config is None: - kunlun_linear_weights = self.gate.get_weights() - final_hidden_states = self.experts( - hidden_states=hidden_states, linear_weights=kunlun_linear_weights - ) - else: - kunlun_linear_weights = self.gate.get_weights() - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits, - linear_weights=kunlun_linear_weights, - ) + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) - if self.tp_size > 1: - final_hidden_states = ( - self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states - ) - ) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + kunlun_linear_weights = self.gate.get_weights() + final_hidden_states = self.experts(hidden_states=hidden_states, + router_logits=router_logits, + linear_weights=kunlun_linear_weights) - return final_hidden_states.view(orig_shape) + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0) + final_hidden_states = final_hidden_states[:num_tokens] + + # return to 1d if input is 1d + return final_hidden_states.squeeze(0) if is_input_1d else \ + final_hidden_states class Qwen3MoeAttention(nn.Module): @@ -186,6 +204,7 @@ class Qwen3MoeAttention(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + dual_chunk_attention_config: Optional[dict[str, Any]] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -209,36 +228,29 @@ class Qwen3MoeAttention(nn.Module): self.scaling = 
self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] - self.max_position_embeddings = int( - self.max_position_embeddings * scaling_factor - ) + self.dual_chunk_attention_config = dual_chunk_attention_config - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, + max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( self.num_heads, @@ -248,6 +260,10 @@ class Qwen3MoeAttention(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": dual_chunk_attention_config, + } if dual_chunk_attention_config else {}, ) self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -259,33 +275,18 @@ class Qwen3MoeAttention(nn.Module): hidden_states: torch.Tensor, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) - if os.getenv("FUSED_QK_ROPE_OP") == "1": - # Rope fusion operators - q, k, v = Split_Norm_Rope( - qkv, - self.rotary_emb.cos_sin_cache, - 
self.q_norm.weight, - self.k_norm.weight, - positions, - self.max_position_embeddings, - self.num_heads, - self.num_kv_heads, - self.head_dim, - ) - else: - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - # Add qk-norm - q_by_head = q.view( - *q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim - ) - q_by_head = self.q_norm(q_by_head) - q = q_by_head.view(q.shape) - k_by_head = k.view( - *k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim - ) - k_by_head = self.k_norm(k_by_head) - k = k_by_head.view(k.shape) - q, k = self.rotary_emb(positions, q, k) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + # Add qk-norm + q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, + self.head_dim) + q_by_head = self.q_norm(q_by_head) + q = q_by_head.view(q.shape) + + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, + self.head_dim) + k_by_head = self.k_norm(k_by_head) + k = k_by_head.view(k.shape) + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) return output @@ -293,18 +294,21 @@ class Qwen3MoeAttention(nn.Module): class Qwen3MoeDecoderLayer(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + + config = vllm_config.model_config.hf_text_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + dual_chunk_attention_config = getattr(config, + 
"dual_chunk_attention_config", + None) self.self_attn = Qwen3MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -313,36 +317,33 @@ class Qwen3MoeDecoderLayer(nn.Module): rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, "attention_bias", False), - head_dim=getattr(config, "head_dim", None), + qkv_bias=getattr(config, 'attention_bias', False), + head_dim=getattr(config, 'head_dim', None), cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + dual_chunk_attention_config=dual_chunk_attention_config, ) # `mlp_only_layers` in the config. layer_idx = extract_layer_index(prefix) - mlp_only_layers = ( - [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers - ) + mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else + config.mlp_only_layers) if (layer_idx not in mlp_only_layers) and ( - config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0 - ): - self.mlp = Qwen3MoeSparseMoeBlock( - config=config, quant_config=quant_config, prefix=f"{prefix}.mlp" - ) + config.num_experts > 0 and + (layer_idx + 1) % config.decoder_sparse_step == 0): + self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config, + prefix=f"{prefix}.mlp") else: - self.mlp = Qwen3MoeMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) + self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + 
self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -355,14 +356,16 @@ class Qwen3MoeDecoderLayer(nn.Module): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -373,30 +376,30 @@ class Qwen3MoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config + config = vllm_config.model_config.hf_text_config quant_config = vllm_config.quant_config + parallel_config = vllm_config.parallel_config + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, config.hidden_size, prefix=f"{prefix}.embed_tokens" - ) + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: Qwen3MoeDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - ), + lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, + prefix=prefix), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = 
make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -418,17 +421,28 @@ class Qwen3MoeModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + num_redundant_experts=self.num_redundant_experts) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -438,21 +452,16 @@ class Qwen3MoeModel(nn.Module): ("gate_up_proj", "up_proj", 1), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( 
- ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts, - ) + # Skip loading extra parameters for GPTQ/modelopt models. + ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale", + ".v_scale", "_v_scale", ".weight_scale", + "_weight_scale", ".input_scale", "_input_scale") params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - weights_to_quantize = {} - + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: + for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: continue @@ -465,73 +474,79 @@ class Qwen3MoeModel(nn.Module): if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if ( - name.endswith(".bias") or name.endswith("_bias") - ) and name not in params_dict: + + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith(ignore_suffixes) and name not in params_dict: continue + # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue if name not in params_dict: continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - loaded_params.add(name) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - # Map to the parameter name in the model + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable name_mapped = name.replace(weight_name, param_name) - # Layer/PP skip judgment if is_pp_missing_parameter(name_mapped, self): continue - if ( - name_mapped.endswith(".bias") or name_mapped.endswith("_bias") + + # Skip loading extra parameters for GPTQ/modelopt models. + if name_mapped.endswith( + ignore_suffixes ) and name_mapped not in params_dict: continue - # Get the param and target module - param = params_dict.get(name_mapped, None) - if param is None: + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. 
+ weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it continue - # === Only when the target MoE layer has int8 weights and scales, and the name matches, the "streaming quantization" is performed === - if self._should_stream_quantize(name_mapped): - # Note: Pass the mapped name_mapped instead of the original name - self._stream_quantize_moe_weight( - name_mapped, - param, - loaded_weight, - expert_id=expert_id, - shard_id=shard_id, - ) - loaded_params.add(name_mapped) - else: - # Fallback: Normal weight loading (non-quantized) - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader( - param, - loaded_weight, - name_mapped, - shard_id=shard_id, - expert_id=expert_id, - ) - loaded_params.add(name_mapped) - break - else: - # Skip loading extra bias for GPTQ models. - if ( - name.endswith(".bias") or name.endswith("_bias") - ) and name not in params_dict: + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith( + ignore_suffixes) and name not in params_dict: continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): @@ -539,241 +554,26 @@ class Qwen3MoeModel(nn.Module): # Remapping the name of FP8 kv-scale. if name.endswith("kv_scale"): remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale" - ) + ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: logger.warning_once( - "Found kv scale in the checkpoint " - f"(e.g. {name}), but not found the expected " - f"name in the model " - f"(e.g. {remapped_kv_scale_name}). " - "kv-scale is not loaded." + "Found kv scale in the checkpoint (e.g. 
%s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501 + name, + remapped_kv_scale_name, ) continue else: name = remapped_kv_scale_name param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) - loaded_params.add(name) - # loaded_params.add(name) + loaded_params.add(name) return loaded_params - def _is_moe_weight(self, name: str) -> bool: - """Check if the weight is MoE weight""" - return name.endswith("w13_weight") or name.endswith("w2_weight") - def _is_expert_complete(self, cache_key): - cache = self._moe_weight_cache.get(cache_key) - if cache is None: - return False - w13_ok = (0 in cache["w13_shards"]) and (1 in cache["w13_shards"]) - w2_ok = cache["w2_weight"] is not None - return w13_ok and w2_ok - - @torch.no_grad() - def _stream_quantize_moe_weight( - self, - param_name: str, - param: nn.Parameter, - loaded_weight: torch.Tensor, - *, - expert_id, - shard_id, - ): - - rank = os.environ.get("RANK", "0") - - # Ensure expert_id is an integer - try: - expert_id = int(expert_id) - except (ValueError, TypeError): - if isinstance(expert_id, str): - expert_id = int(expert_id) - - # Process shard_id - if isinstance(shard_id, str): - if shard_id in ("gate", "w1"): - shard_id = 0 - elif shard_id in ("up", "w3"): - shard_id = 1 - elif shard_id == "w2": - shard_id = 0 - else: - try: - shard_id = int(shard_id) - except ValueError: - shard_id = 0 - else: - shard_id = int(shard_id) - - # Initialize cache - if not hasattr(self, "_moe_weight_cache"): - self._moe_weight_cache = {} - self._expert_batch_count = 0 # Batch counter - - module_path = ".".join(param_name.split(".")[:-1]) - cache_key = (module_path, expert_id) - - cache = self._moe_weight_cache.get(cache_key) - if cache is None: - cache = { - "w13_shards": {}, - "w2_weight": None, - "target_module": 
self.get_submodule(module_path), - "done": False, - } - self._moe_weight_cache[cache_key] = cache - - if cache.get("done", False): - return - - # Cache weights (keep original precision) - if "w13_weight" in param_name: - cache["w13_shards"][shard_id] = loaded_weight.clone() - elif "w2_weight" in param_name: - cache["w2_weight"] = loaded_weight.clone() - - # Check if complete - if self._is_expert_complete(cache_key): - # Quantize this expert - self._quantize_expert_weights(cache_key) - cache["done"] = True - self._moe_weight_cache.pop(cache_key, None) - - # Force synchronization every 4 experts - self._expert_batch_count += 1 - if self._expert_batch_count % 4 == 0: - torch.cuda.synchronize() # Force synchronization - # print(f"[Rank {rank}] Completed batch of {self._expert_batch_count} experts") - - def _quantize_expert_weights(self, cache_key): - """Quantize the complete weights of an expert (supports TP sharding)""" - module_path, expert_id = cache_key - cache = self._moe_weight_cache[cache_key] - target_module = cache["target_module"] - - # Get TP config - from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - ) - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - - # Get actual shapes - E, twoN, H = target_module.w13_weight.shape - _, H2, N = target_module.w2_weight.shape - - qmax = 127.0 - - # Process w13_weight: concatenate gate and up - gate_weight = cache["w13_shards"][0] # [768, 2048] - up_weight = cache["w13_shards"][1] # [768, 2048] - - # TP sharding - if tp_size > 1: - # Calculate shard for each TP rank - gate_per_rank = gate_weight.shape[0] // tp_size - up_per_rank = up_weight.shape[0] // tp_size - - gate_start = tp_rank * gate_per_rank - gate_end = (tp_rank + 1) * gate_per_rank - up_start = tp_rank * up_per_rank - up_end = (tp_rank + 1) * up_per_rank - - gate_weight = gate_weight[gate_start:gate_end, :] # [192, 2048] - up_weight = 
up_weight[up_start:up_end, :] # [192, 2048] - - w13_complete = torch.cat([gate_weight, up_weight], dim=0) # [384, 2048] - - # Quantize w13_weight - w13_f = w13_complete.float() - w13_abs_max = torch.amax(torch.abs(w13_f), dim=-1) # [384] - w13_scale_2d = torch.clamp(w13_abs_max, min=1e-6) / qmax # [384] - w13_scale_3d = w13_scale_2d.unsqueeze(-1) # [384, 1] - w13_q = torch.round(w13_f / w13_scale_3d).clamp_(-128, 127).to(torch.int8) - - # Write w13_weight - target_module.w13_weight.data[expert_id, :, :].copy_( - w13_q.to(target_module.w13_weight.device) - ) - - # Update w13_scale - pre-multiply 127 - s = getattr(target_module, "w13_weight_scale") - s.data[expert_id, :].copy_((w13_scale_2d * 127.0).to(s.device)) - - # Process w2_weight - w2_weight = cache["w2_weight"] # [2048, 768] - - # TP sharding for w2 weight - if tp_size > 1: - w2_per_rank = w2_weight.shape[1] // tp_size - w2_start = tp_rank * w2_per_rank - w2_end = (tp_rank + 1) * w2_per_rank - w2_weight = w2_weight[:, w2_start:w2_end] # [2048, 192] - - w2_f = w2_weight.float() # [2048, 192] - w2_abs_max = torch.amax(torch.abs(w2_f), dim=-1) # [2048] - w2_scale_2d = torch.clamp(w2_abs_max, min=1e-6) / qmax # [2048] - w2_scale_3d = w2_scale_2d.unsqueeze(-1) # [2048, 1] - w2_q = torch.round(w2_f / w2_scale_3d).clamp_(-128, 127).to(torch.int8) - - # Write w2_weight - w2_param = getattr(target_module, "w2_weight") - w2_param.data[expert_id, :, :].copy_(w2_q.to(w2_param.device)) - - # Update w2_scale - pre-multiply 127 - w2_s = getattr(target_module, "w2_weight_scale") - w2_s.data[expert_id, :].copy_((w2_scale_2d * 127.0).to(w2_s.device)) - - # Clear cache - cache["w13_shards"].clear() - cache["w2_weight"] = None - - def _is_int8_moe_target_module(self, module_path: str) -> bool: - """Check if a module_path is a FusedMoE target using INT8(W8A8). - Determine by the actual existing parameters and dtype, not relying on quant_config names. 
- """ - try: - mod = self.get_submodule(module_path) - except Exception: - return False - # Need to have both int8 weights and float32 scales, and dimensions come from CompressedTensorsW8A8 path - if not ( - hasattr(mod, "w13_weight") - and hasattr(mod, "w2_weight") - and hasattr(mod, "w13_weight_scale") - and hasattr(mod, "w2_weight_scale") - ): - return False - try: - return ( - mod.w13_weight.dtype == torch.int8 - and mod.w2_weight.dtype == torch.int8 - and mod.w13_weight_scale.dtype == torch.float32 - and mod.w2_weight_scale.dtype == torch.float32 - ) - except Exception: - return False - - def _should_stream_quantize(self, param_name: str) -> bool: - """Only when (1) the parameter name corresponds to the MoE weights we defined; and - (2) the MoE layer is indeed the INT8 path (exists int8 weights + scales) - Stream quantization is enabled; otherwise, it falls back to the default loading. - """ - # First, determine if it is the MoE weight name we want to process (w13_weight / w2_weight) - if not self._is_moe_weight(param_name): - return False - # Then, check if the module containing this param is the INT8 path - module_path = ".".join(param_name.split(".")[:-1]) - return self._is_int8_moe_target_module(module_path) - - -class Qwen3MoeForCausalLM(nn.Module, SupportsPP): +class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, + MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -790,22 +590,81 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_text_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Qwen3MoeModel( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=quant_config - ) + self.model = 
Qwen3MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) + self.model.make_empty_intermediate_tensors) + + # Set MoE hyperparameters + self.expert_weights = [] + + self.moe_layers: list[FusedMoE] = [] + example_layer = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, Qwen3MoeDecoderLayer) + if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No Qwen3MoE layer found in the model.layers.") + + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_layer.n_logical_experts + self.num_physical_experts = example_layer.n_physical_experts + self.num_local_physical_experts = example_layer.n_local_physical_experts + self.num_routed_experts = example_layer.n_routed_experts + self.num_redundant_experts = example_layer.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -816,21 +675,22 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP): positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - kv_caches: list[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> 
set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm_kunlun/models/qwen3_next.py b/vllm_kunlun/models/qwen3_next.py new file mode 100644 index 0000000..d8c0aac --- /dev/null +++ b/vllm_kunlun/models/qwen3_next.py @@ -0,0 +1,1335 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only Qwen3Next model.""" +from collections.abc import Iterable +from itertools import islice +from typing import Optional + +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import nn +from transformers.activations import ACT2FN + +from vllm.attention import AttentionBackend, AttentionMetadata + +from vllm_kunlun.ops.attention.layer import Attention + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig, + VllmConfig, get_current_vllm_config) +from vllm.distributed import (divide, get_ep_group, get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm_kunlun.ops.fla import (fused_recurrent_gated_delta_rule, torch_chunk_gated_delta_rule, chunk_gated_delta_rule) +from vllm.model_executor.layers.fla.ops import ( + RMSNormGated) +from vllm_kunlun.ops.fused_moe.layer import FusedMoE +# yapf conflicts with isort for this block +# yapf: disable +from vllm.model_executor.layers.layernorm import ( + GemmaRMSNorm as Qwen3NextRMSNorm) +# yapf: enable +from vllm_kunlun.ops.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm_kunlun.ops.linear import ReplicatedLinear +from vllm.model_executor.layers.logits_processor 
@torch.compile(dynamic=True, backend="aot_eager")
def get_masked_input_and_mask_kunlun(
        input_: torch.Tensor, org_vocab_start_index: int,
        org_vocab_end_index: int, num_org_vocab_padding: int,
        added_vocab_start_index: int,
        added_vocab_end_index: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Translate token ids into shard-local embedding rows.

    Ids inside ``[org_vocab_start_index, org_vocab_end_index)`` are shifted
    down by ``org_vocab_start_index``.  Ids inside
    ``[added_vocab_start_index, added_vocab_end_index)`` are shifted so that
    they land right after the original range plus ``num_org_vocab_padding``
    padding rows.  Every other id is mapped to ``0``.

    Returns:
        A pair ``(local_ids, outside_mask)`` where ``outside_mask`` is True
        for ids that fall in neither range.
    """
    # Membership of each id in the two half-open ranges.
    in_org_range = (input_ >= org_vocab_start_index) & (
        input_ < org_vocab_end_index)
    in_added_range = (input_ >= added_vocab_start_index) & (
        input_ < added_vocab_end_index)

    # Shift that places added ids directly behind the (padded) original rows.
    org_span = org_vocab_end_index - org_vocab_start_index
    added_shift = added_vocab_start_index - org_span - num_org_vocab_padding

    # Per-token shift: only one of the two masks can be set for a given id.
    per_token_shift = (org_vocab_start_index *
                       in_org_range) + (added_shift * in_added_range)

    in_any_range = in_org_range | in_added_range
    # Multiplying by the boolean mask zeroes ids outside both ranges.
    local_ids = in_any_range * (input_ - per_token_shift)
    return local_ids, ~in_any_range
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + +class Qwen3NextSparseMoeBlock(nn.Module): + + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + parallel_config = vllm_config.parallel_config + quant_config = vllm_config.quant_config + + self.tp_size = get_tensor_model_parallel_world_size() + + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.num_experts + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + + if self.tp_size > config.num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.num_experts}.") + + # Load balancing settings. + vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + + self.experts = FusedMoE(num_experts=self.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel) + + 
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    """Run the routed experts (plus the optional shared expert) on tokens.

    Args:
        hidden_states: Either a single token vector of shape
            ``(hidden_dim,)`` or a batch of shape ``(num_tokens, hidden_dim)``.

    Returns:
        A tensor with the same shape as the input.
    """
    # NOTE: hidden_states can have either 1D or 2D shape.  Take the feature
    # size from the last axis and count tokens only after flattening, so a
    # 1D input no longer fails on tuple unpacking of its shape.
    orig_shape = hidden_states.shape
    hidden_dim = orig_shape[-1]
    hidden_states = hidden_states.view(-1, hidden_dim)
    num_tokens = hidden_states.shape[0]

    if self.is_sequence_parallel:
        hidden_states = sequence_parallel_chunk(hidden_states)

    shared_output = None
    if self.shared_expert is not None:
        shared_output = self.shared_expert(hidden_states)
        if self.shared_expert_gate is not None:
            # Per-token sigmoid gate scales the shared expert contribution.
            shared_output = F.sigmoid(
                self.shared_expert_gate(hidden_states)) * shared_output

    # router_logits: (num_tokens, n_experts)
    router_logits, _ = self.gate(hidden_states)
    # The gate's weights are forwarded to the experts call as
    # ``linear_weights`` alongside the router logits.
    kunlun_linear_weights = self.gate.get_weights()
    final_hidden_states = self.experts(hidden_states=hidden_states,
                                       router_logits=router_logits,
                                       linear_weights=kunlun_linear_weights)

    if shared_output is not None:
        final_hidden_states = final_hidden_states + shared_output

    if self.is_sequence_parallel:
        # Gather the per-rank chunks back and drop rows beyond the original
        # token count.
        final_hidden_states = tensor_model_parallel_all_gather(
            final_hidden_states, 0)
        final_hidden_states = final_hidden_states[:num_tokens]
    elif self.tp_size > 1:
        final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
            final_hidden_states)

    return final_hidden_states.view(orig_shape)
@property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend + return GDNAttentionBackend + + def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + self.model_config.dtype, self.cache_config.mamba_cache_dtype) + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.gated_delta_net_state_shape( + self.tp_size, self.num_k_heads, self.num_v_heads, self.head_k_dim, + self.head_v_dim, self.conv_kernel_size, self.num_spec) + + def __init__( + self, + config: Qwen3NextConfig, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.hidden_size = config.hidden_size + self.num_v_heads = config.linear_num_value_heads + self.num_k_heads = config.linear_num_key_heads + self.head_k_dim = config.linear_key_head_dim + self.head_v_dim = config.linear_value_head_dim + self.key_dim = self.head_k_dim * self.num_k_heads + self.value_dim = self.head_v_dim * self.num_v_heads + + self.conv_kernel_size = config.linear_conv_kernel_dim + self.layer_idx = extract_layer_index(prefix) + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.layer_norm_epsilon = config.rms_norm_eps + self.prefix = prefix + + self.config = config + self.model_config = model_config + self.cache_config = cache_config + self.quant_config = quant_config + self.speculative_config = speculative_config + self.num_spec = (self.speculative_config.num_speculative_tokens + if self.speculative_config else 0) + + # QKV + 
self.conv_dim = self.key_dim * 2 + self.value_dim + self.conv1d = ColumnParallelLinear( + input_size=self.conv_kernel_size, + output_size=self.conv_dim, + bias=False, + prefix=f"{prefix}.conv1d", + ) + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + # projection of the input hidden states + self.projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2 + self.projection_size_ba = self.num_v_heads * 2 + self.in_proj_qkvz = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.projection_size_qkvz, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_qkvz", + ) + # ba_proj doesn't support blockwise fp8 quantization. + self.in_proj_ba = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.projection_size_ba, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_ba", + ) + + query_key_settings = (self.key_dim, 0, False) + value_settings = (self.value_dim, 0, False) + + delattr(self.conv1d.weight, "weight_loader") + set_weight_attrs( + self.conv1d.weight, { + "weight_loader": + mamba_v2_sharded_weight_loader([ + query_key_settings, + query_key_settings, + value_settings, + ], self.tp_size, self.tp_rank) + }) + + # selective projection used to make dt, B and C input dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter( + torch.ones(self.num_v_heads // self.tp_size), ) + self.A_log = nn.Parameter( + torch.empty( + divide(self.num_v_heads, self.tp_size), + dtype=torch.float32, + )) + + set_weight_attrs(self.A_log, + {"weight_loader": sharded_weight_loader(0)}) + set_weight_attrs(self.dt_bias, + {"weight_loader": sharded_weight_loader(0)}) + + self.norm = RMSNormGated( + self.head_v_dim, + eps=self.layer_norm_epsilon, + group_size=None, + norm_before_gate=True, + device=current_platform.current_device(), + dtype=config.torch_dtype, + ) + + self.out_proj = 
def fix_query_key_value_ordering(
    self,
    mixed_qkvz,
    mixed_ba,
):
    """
    Derives `query`, `key`, `value`, `z`, `b` and `a` from the two fused
    projections `mixed_qkvz` and `mixed_ba`.

    Both inputs are viewed as ``[..., num_k_heads // tp_size, per_group]``
    and then split along dim 2, so they are expected to be 2D
    ``[tokens, features]`` tensors with the features grouped per key head.

    Returns:
        ``(query, key, value, z, b, a)`` where ``query``/``key`` keep the
        per-key-head layout, ``value``/``z`` are reshaped to
        ``[tokens, v_heads, head_v_dim]`` and ``b``/``a`` to
        ``[tokens, num_v_heads // tp_size]``.
    """
    new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
        self.num_k_heads // self.tp_size,
        (self.head_k_dim + self.head_k_dim +
         (self.head_v_dim + self.head_v_dim) * self.num_v_heads //
         self.num_k_heads),
    )
    # Derive the b/a view from mixed_ba itself rather than from mixed_qkvz,
    # so each tensor is reshaped from its own leading dimensions.
    new_tensor_shape_ba = mixed_ba.size()[:-1] + (
        self.num_k_heads // self.tp_size,
        2 * self.num_v_heads // self.num_k_heads,
    )

    mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
    mixed_ba = mixed_ba.view(*new_tensor_shape_ba)

    split_arg_list_qkvz = [
        self.head_k_dim,
        self.head_k_dim,
        (self.num_v_heads // self.num_k_heads * self.head_v_dim),
        (self.num_v_heads // self.num_k_heads * self.head_v_dim),
    ]
    split_arg_list_ba = [
        self.num_v_heads // self.num_k_heads,
        self.num_v_heads // self.num_k_heads
    ]

    # [t, ng, (hk + hk + np/ng * hv + np/ng * hv)]
    # --> [t, ng, hk], [t, ng, hk], [t, ng, np/ng * hv], [t, ng, np/ng * hv]
    (query, key, value, z) = torch.split(mixed_qkvz,
                                         split_arg_list_qkvz,
                                         dim=2)
    # [t, ng, 2 * np/ng] --> [t, ng, np/ng], [t, ng, np/ng]
    (b, a) = torch.split(mixed_ba, split_arg_list_ba, dim=2)

    # [t, ng, np/ng * hv] -> [t, np, hv]
    value = value.reshape(value.size(0), -1, self.head_v_dim)
    z = z.reshape(z.size(0), -1, self.head_v_dim)
    # [t, ng, np/ng] -> [t, np]
    b = b.reshape(b.size(0), self.num_v_heads // self.tp_size)
    a = a.reshape(a.size(0), self.num_v_heads // self.tp_size)

    return query, key, value, z, b, a
+ if mixed_qkv is None: + return None, None, None + query, key, value = torch.split( + mixed_qkv, + [ + self.key_dim // self.tp_size, + self.key_dim // self.tp_size, + self.value_dim // self.tp_size, + ], + dim=-1, + ) + query, key = map( + lambda x: rearrange(x, 'l (h d) -> 1 l h d', d=self.head_k_dim), + (query, key)) + value = rearrange(value, 'l (h d) -> 1 l h d', d=self.head_v_dim) + return query, key, value + + def forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + ): + return torch.ops.vllm.gdn_attention( + hidden_states, + output, + self.prefix, + ) + + def _forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + ): + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = forward_context.attn_metadata + + if attn_metadata is None: + # V1 profile run + return + + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, GDNAttentionMetadata) + has_initial_state = attn_metadata.has_initial_state + spec_query_start_loc = attn_metadata.spec_query_start_loc + non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc + spec_sequence_masks = attn_metadata.spec_sequence_masks + spec_token_masks = attn_metadata.spec_token_masks + spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 + non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + num_actual_tokens = attn_metadata.num_actual_tokens + num_accepted_tokens = attn_metadata.num_accepted_tokens + if spec_token_masks is not None: + spec_token_masks = spec_token_masks[:num_actual_tokens] + + # 1. 
Set up dimensions for reshapes later + projected_states_qkvz, _ = self.in_proj_qkvz( + hidden_states[:num_actual_tokens]) + projected_states_ba, _ = self.in_proj_ba( + hidden_states[:num_actual_tokens]) + query, key, value, z, b, a = self.fix_query_key_value_ordering( + projected_states_qkvz, projected_states_ba) + query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'), + (query, key, value)) + mixed_qkv = torch.cat((query, key, value), dim=-1) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + + if spec_sequence_masks is not None: + if (attn_metadata.num_prefills == 0 + and attn_metadata.num_decodes == 0): + mixed_qkv_spec = mixed_qkv + mixed_qkv_non_spec = None + else: + mixed_qkv_spec = mixed_qkv[spec_token_masks] + mixed_qkv_non_spec = mixed_qkv[~spec_token_masks] + else: + mixed_qkv_spec = None + mixed_qkv_non_spec = mixed_qkv + + # 2.1: process the mutli-query part + if spec_sequence_masks is not None: + mixed_qkv_spec = causal_conv1d_update( + mixed_qkv_spec, + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=spec_state_indices_tensor[:, 0] + [:attn_metadata.num_spec_decodes], + num_accepted_tokens=num_accepted_tokens, + query_start_loc=spec_query_start_loc, + max_query_len=spec_state_indices_tensor.size(-1), + validate_data=False, + ) + + # 2.2: process the remaining part + if attn_metadata.num_prefills > 0: + mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1) + # - "cache_indices" updates the conv_state cache in positions + # pointed to by "state_indices_tensor" + mixed_qkv_non_spec = causal_conv1d_fn( + mixed_qkv_non_spec_T, + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=conv_state, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + elif 
attn_metadata.num_decodes > 0: + mixed_qkv_non_spec = causal_conv1d_update( + mixed_qkv_non_spec, + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=non_spec_state_indices_tensor[:attn_metadata + .num_decodes], + validate_data=True, + ) + else: + mixed_qkv_non_spec = None + + query_spec, key_spec, value_spec = self.rearrange_mixed_qkv( + mixed_qkv_spec) + query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv( + mixed_qkv_non_spec) + + beta = b.sigmoid() + + g = ops.fused_gdn_gating(self.A_log.float(), a, self.dt_bias.float()) + g, beta = map(lambda x: rearrange(x, 'l d -> 1 l d'), (g, beta)) + + if spec_sequence_masks is not None: + if (attn_metadata.num_prefills == 0 + and attn_metadata.num_decodes == 0): + g_spec = g + beta_spec = beta + g_non_spec = None + beta_non_spec = None + else: + g_spec = g[:, spec_token_masks] + beta_spec = beta[:, spec_token_masks] + g_non_spec = g[:, ~spec_token_masks] + beta_non_spec = beta[:, ~spec_token_masks] + else: + g_spec = None + beta_spec = None + g_non_spec = g + beta_non_spec = beta + + # 3. Recurrent attention + + # 3.1: process the mutlti-query part + if spec_sequence_masks is not None: + core_attn_out_spec, last_recurrent_state = ( + fused_recurrent_gated_delta_rule( + q=query_spec, + k=key_spec, + v=value_spec, + g=g_spec, + beta=beta_spec, + initial_state=ssm_state, + inplace_final_state=True, + cu_seqlens=spec_query_start_loc[:attn_metadata. + num_spec_decodes + 1], + ssm_state_indices=spec_state_indices_tensor, + num_accepted_tokens=num_accepted_tokens, + use_qk_l2norm_in_kernel=True, + )) + else: + core_attn_out_spec, last_recurrent_state = None, None + + # 3.2: process the remaining part + if attn_metadata.num_prefills > 0: + initial_state = ssm_state[ + non_spec_state_indices_tensor].contiguous() + initial_state[~has_initial_state, ...] 
= 0 + if self.num_v_heads // self.num_k_heads > 1: + query_non_spec = query_non_spec.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + key_non_spec = key_non_spec.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + ( + core_attn_out_non_spec, + last_recurrent_state, + ) = chunk_gated_delta_rule( + q=query_non_spec, + k=key_non_spec, + v=value_non_spec, + g=g_non_spec, + beta=beta_non_spec, + initial_state=initial_state, + output_final_state=True, + use_qk_l2norm_in_kernel=True, + cu_seqlens=non_spec_query_start_loc, + ) + # Init cache + ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to( + ssm_state.dtype) + elif attn_metadata.num_decodes > 0: + core_attn_out_non_spec, last_recurrent_state = ( + fused_recurrent_gated_delta_rule( + q=query_non_spec, + k=key_non_spec, + v=value_non_spec, + g=g_non_spec, + beta=beta_non_spec, + initial_state=ssm_state, + inplace_final_state=True, + cu_seqlens=non_spec_query_start_loc[:attn_metadata. + num_decodes + 1], + ssm_state_indices=non_spec_state_indices_tensor, + use_qk_l2norm_in_kernel=True, + )) + else: + core_attn_out_non_spec, last_recurrent_state = None, None + + # Merge core attention output + if (spec_sequence_masks is not None + and core_attn_out_non_spec is not None): + core_attn_out = torch.empty( + (1, num_actual_tokens, *core_attn_out_spec.shape[2:]), + dtype=core_attn_out_non_spec.dtype, + device=core_attn_out_non_spec.device, + ) + core_attn_out[:, spec_token_masks] = core_attn_out_spec + core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec + elif spec_sequence_masks is not None: + core_attn_out = core_attn_out_spec + else: + core_attn_out = core_attn_out_non_spec + + z_shape_og = z.shape + # reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1]) + z = z.reshape(-1, z.shape[-1]) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(z_shape_og) + core_attn_out = 
class Qwen3NextAttention(nn.Module):
    """Full-attention mixer with per-head q/k RMSNorm and optional output gate.

    When ``attn_output_gate`` is enabled the fused QKV projection emits twice
    as many query channels; the second half of every head is used as a
    sigmoid gate on the attention output before the output projection.
    """

    def __init__(
        self,
        config: Qwen3NextConfig,
        model_config: Optional[ModelConfig] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Build the projections, rotary embedding, attention op and norms.

        Args:
            config: HF model config supplying head counts, sizes and RoPE
                settings.
            model_config: Accepted for interface parity; not read here.
            cache_config: Forwarded to the attention op.
            quant_config: Forwarded to the linear layers and attention op.
            prefix: Dotted module path used to name the sub-layers.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = config.num_attention_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = config.num_key_value_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        # The fallback must divide by the *global* head count: dividing by
        # the per-rank count would inflate head_dim by tp_size whenever the
        # config does not set head_dim explicitly.
        self.head_dim = config.head_dim or (self.hidden_size //
                                            self.total_num_heads)
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.dual_chunk_attention_config = getattr(
            config, "dual_chunk_attention_config", None)
        self.attn_output_gate = getattr(config, "attn_output_gate", True)

        self.qkv_proj = QKVParallelLinear(
            config.hidden_size,
            self.head_dim,
            # Doubled query width when the output gate is enabled.
            self.total_num_heads * (1 + self.attn_output_gate),
            self.total_num_kv_heads,
            bias=getattr(config, "qkv_bias", False),
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )

        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            config.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        self.rotary_emb = get_rope(
            head_size=self.head_dim,
            rotary_dim=self.head_dim,
            max_position=config.max_position_embeddings,
            base=config.rope_theta,
            rope_scaling=config.rope_scaling,
            partial_rotary_factor=config.partial_rotary_factor,
            dual_chunk_attention_config=self.dual_chunk_attention_config,
        )

        # Extra keyword arguments are only passed when dual-chunk attention
        # is configured.
        extra_attn_kwargs = {
            "layer_idx": extract_layer_index(prefix),
            "dual_chunk_attention_config": self.dual_chunk_attention_config,
        } if self.dual_chunk_attention_config else {}
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            **extra_attn_kwargs,
        )

        self.q_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
    ):
        """Compute attention for ``hidden_states`` and write it into ``output``.

        The result of the output projection is stored in ``output[:]``;
        nothing is returned.
        """
        qkv, _ = self.qkv_proj(hidden_states)

        if self.attn_output_gate:
            q_gate, k, v = qkv.split(
                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1)
            orig_shape = q_gate.shape[:-1]
            # Each head carries [query | gate]; split them per head and
            # flatten the heads back into the channel axis.
            q_gate = q_gate.view(*orig_shape, self.num_heads, -1)
            q, gate = torch.chunk(q_gate, 2, dim=-1)
            q = q.reshape(*orig_shape, -1)
            gate = gate.reshape(*orig_shape, -1)
        else:
            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
                                dim=-1)

        # RMSNorm is applied per head, then heads are flattened again.
        q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)).view(
            -1, self.num_heads * self.head_dim)
        k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)).view(
            -1, self.num_kv_heads * self.head_dim)

        q, k = self.rotary_emb(positions, q, k)

        attn_output = self.attn(q, k, v)

        if self.attn_output_gate:
            gate = torch.sigmoid(gate)
            attn_output = attn_output * gate

        output[:], _ = self.o_proj(attn_output)
def forward(
    self,
    hidden_states: torch.Tensor,
    residual: Optional[torch.Tensor],
    positions: torch.Tensor = None,
    **kwargs: object,
):
    """Apply one decoder layer: norm -> token mixer -> norm -> MLP.

    Args:
        hidden_states: Layer input.
        residual: Running residual stream, or ``None`` for the first layer,
            in which case the input itself seeds the residual.
        positions: Position ids, only consumed by the full-attention mixer.
        **kwargs: Ignored.

    Returns:
        ``(hidden_states, residual)`` for the next layer.

    Raises:
        ValueError: If ``self.layer_type`` is not a known mixer type.
    """
    if residual is not None:
        hidden_states, residual = self.input_layernorm(
            hidden_states, residual)
    else:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

    # The mixers write their result into a pre-allocated buffer.
    mixer_output = torch.empty_like(hidden_states)
    mixer_kind = self.layer_type
    if mixer_kind == "full_attention":
        self.self_attn(
            hidden_states=hidden_states,
            output=mixer_output,
            positions=positions,
        )
    elif mixer_kind == "linear_attention":
        self.linear_attn(
            hidden_states=hidden_states,
            output=mixer_output,
        )
    else:
        raise ValueError("Invalid layer_type")
    hidden_states = mixer_output

    if self.layer_scale:
        attn_scale = self.attn_layer_scale.to(hidden_states.dtype)
        if len(hidden_states.shape) == 2:
            # Drop the leading axis so the scale broadcasts over 2D input.
            attn_scale = attn_scale[0]
        hidden_states = hidden_states * (attn_scale + 1)

    # Fully Connected
    hidden_states, residual = self.post_attention_layernorm(
        hidden_states, residual)
    hidden_states = self.mlp(hidden_states)

    if not self.layer_scale:
        return hidden_states, residual

    if len(hidden_states.shape) == 2:
        ffn_scale = self.ffn_layer_scale.to(hidden_states.dtype)[0]
    else:
        assert len(hidden_states.shape) == len(
            self.ffn_layer_scale.shape
        ), f'shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}'  # noqa: E501
        ffn_scale = self.ffn_layer_scale.to(hidden_states.dtype)
    hidden_states = hidden_states * (ffn_scale + 1)
    return hidden_states, residual
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    """Copy checkpoint tensors into this model's parameters.

    Each checkpoint entry is tried, in order, against (1) the stacked
    projection mapping, (2) the per-expert mapping, and (3) a direct
    name lookup.

    Args:
        weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

    Returns:
        The set of parameter names recorded as loaded.
    """
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    params_dict = dict(self.named_parameters())
    loaded_params: set[str] = set()
    expert_params_mapping = self.get_expert_mapping()
    for name, loaded_weight in weights:
        # Entries that are never loaded by this model.
        if "rotary_emb.inv_freq" in name:
            continue

        if name.startswith("mtp."):
            continue

        # Stage 1: stacked projections (q/k/v and gate/up).
        for param_name, weight_name, shard_id in stacked_params_mapping:
            if weight_name not in name:
                continue

            # Expert weights are handled by the expert mapping below.
            if "mlp.experts" in name:
                continue

            # NOTE(review): `name` is rewritten before the skip checks, so
            # a `continue` below leaves the rewritten name in place for
            # the remaining iterations and the fall-through stages.
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # name = apply_attn_prefix(name, params_dict)
            if name not in params_dict:
                continue
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Stage 2: reached only when stage 1 did not `break`.
            for mapping in expert_params_mapping:
                param_name, weight_name, expert_id, shard_id = mapping
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip layers on other devices.
                if is_pp_missing_parameter(name, self):
                    continue
                # Skip loading extra bias for GPTQ models.
                if ((name.endswith(".bias") or name.endswith("_bias"))
                        and name not in params_dict):
                    continue
                # NOTE(review): unlike stage 1 there is no general
                # `name not in params_dict` guard here, so an unknown
                # non-bias name raises KeyError — confirm this is intended.
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param,
                              loaded_weight,
                              name,
                              shard_id=shard_id,
                              expert_id=expert_id)
                break
            else:
                # Stage 3: plain parameter, looked up by its (possibly
                # rewritten) name.  These `continue`s skip to the next
                # checkpoint entry.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                # Parameters without a custom loader use the default one.
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
        # Reached for every entry that was not skipped by an outer-level
        # `continue`.
        loaded_params.add(name)
    return loaded_params
layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, Qwen3NextDecoderLayer) + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No Qwen3Next layer found in the model.layers.") + + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_layer.n_logical_experts + self.num_physical_experts = example_layer.n_physical_experts + self.num_local_physical_experts = example_layer.n_local_physical_experts + self.num_routed_experts = example_layer.n_routed_experts + self.num_redundant_experts = example_layer.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ): + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + + return hidden_states + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + tp_size = parallel_config.tensor_parallel_size + num_spec = 
def gdn_attention(
    hidden_states: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
) -> None:
    """Dispatch to the layer registered under ``layer_name``.

    The layer is looked up in ``no_compile_layers`` of the current forward
    context and its ``_forward`` is called with ``hidden_states`` and
    ``output`` as keyword arguments. Nothing is returned.
    """
    layer = get_forward_context().no_compile_layers[layer_name]
    layer._forward(hidden_states=hidden_states, output=output)
def fused_gdn_gating(
    A_log: torch.Tensor,
    a: torch.Tensor,
    dt_bias: torch.Tensor,
    beta: float = 1.0,
    threshold: float = 20.0,
) -> torch.Tensor:
    """Launch ``fused_gdn_gating_kernel`` over ``a`` and return the result.

    Args:
        A_log: passed to the kernel as ``A_log``.
        a: 2-D tensor unpacked as ``(batch, num_heads)``.
        dt_bias: passed to the kernel as ``dt_bias``.
        beta: softplus ``beta`` forwarded to the kernel.
        threshold: softplus ``threshold`` forwarded to the kernel.

    Returns:
        A float32 tensor with the same shape as ``a``.
    """
    heads_per_block = 8  # used both for the grid size and as BLK_HEADS
    seq_len = 1
    batch, num_heads = a.shape
    g = torch.empty_like(a, dtype=torch.float32)
    launch_grid = (batch, seq_len, triton.cdiv(num_heads, heads_per_block))
    fused_gdn_gating_kernel[launch_grid](
        g,
        A_log,
        a,
        dt_bias,
        seq_len,
        num_heads,
        beta,
        threshold,
        heads_per_block,
        num_warps=1,
    )
    return g
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Qwen3-Omni-Moe model (thinker part).""" + +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import Any, Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PretrainedConfig +from transformers.feature_extraction_utils import BatchFeature +from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import ( + Qwen3OmniMoeConfig, + Qwen3OmniMoeThinkerConfig, +) +from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeAudioEncoder, +) +from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import ( + Qwen3OmniMoeProcessor, +) +from transformers.models.whisper import WhisperFeatureExtractor +from vllm.attention.layer import check_upstream_fa_availability +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) + +# yapf conflicts 
# from vllm-0.11.2 /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/vision.py
def get_llm_pos_ids_for_vision(
    start_idx: int,
    vision_idx: int,
    spatial_merge_size: int,
    t_index: list[int],
    grid_hs: torch.Tensor,
    grid_ws: torch.Tensor,
) -> torch.Tensor:
    """Build (t, h, w) position ids for one vision item.

    The height/width grid of item ``vision_idx`` is divided by
    ``spatial_merge_size``; every entry of ``t_index`` is paired with every
    cell of that reduced grid.

    Returns:
        An integer tensor of shape ``(3, len(t_index) * rows * cols)`` whose
        rows are the temporal, height and width ids, each offset by
        ``start_idx``.
    """
    num_frames = len(t_index)
    rows = grid_hs[vision_idx] // spatial_merge_size
    cols = grid_ws[vision_idx] // spatial_merge_size

    row_ids = (
        torch.arange(rows).view(1, -1, 1).expand(num_frames, -1, cols).flatten()
    )
    col_ids = (
        torch.arange(cols).view(1, 1, -1).expand(num_frames, rows, -1).flatten()
    )
    # Each temporal id is repeated once per cell of the reduced grid.
    frame_ids = (
        torch.Tensor(t_index)
        .to(rows.device)
        .view(-1, 1)
        .expand(-1, rows * cols)
        .long()
        .flatten()
    )
    return torch.stack([frame_ids, row_ids, col_ids]) + start_idx
class Qwen3_VisionPatchMerger(nn.Module):
    """Normalize patch features and project merged patches to ``d_model``.

    ``hidden_size`` is ``context_dim * spatial_merge_size ** 2``. With
    ``use_postshuffle_norm`` the norm is applied after reshaping to
    ``hidden_size`` columns; otherwise it is applied on ``context_dim``
    features before the reshape.
    """

    def __init__(
        self,
        d_model: int,
        context_dim: int,
        norm_layer: Optional[Callable[[int], nn.Module]] = None,
        spatial_merge_size: int = 2,
        use_postshuffle_norm: bool = False,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        merged_dim = context_dim * (spatial_merge_size**2)
        self.hidden_size = merged_dim
        self.use_postshuffle_norm = use_postshuffle_norm

        make_norm = (
            partial(nn.LayerNorm, eps=1e-6) if norm_layer is None else norm_layer
        )
        # The norm width follows where the norm sits relative to the reshape.
        norm_dim = merged_dim if use_postshuffle_norm else context_dim
        self.ln_q = make_norm(norm_dim)

        fc1 = ColumnParallelLinear(
            merged_dim,
            merged_dim,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp.0",
        )
        fc2 = RowParallelLinear(
            merged_dim,
            d_model,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp.2",
        )
        self.mlp = nn.ModuleList([fc1, nn.GELU(), fc2])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the norm, then fc1 -> GELU -> fc2, and return fc2's output."""
        if self.use_postshuffle_norm:
            normed = self.ln_q(x.view(-1, self.hidden_size))
        else:
            normed = self.ln_q(x).view(-1, self.hidden_size)

        fc1, activation, fc2 = self.mlp
        # Both linear layers return a pair; the second element is discarded.
        hidden, _ = fc1(normed)
        merged, _ = fc2(activation(hidden))
        return merged
vs patch_size + if self.apply_vit_abs_pos_embed: + self.pos_embed = nn.Embedding(self.num_grid_per_side**2, self.hidden_size) + else: + self.pos_embed = nn.Parameter( + torch.empty([1, self.num_grid_per_side**2, self.hidden_size]) + ) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList( + [ + Qwen3_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}", + ) + for layer_idx in range(vision_config.depth) + ] + ) + self.merger = Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + ) + if self.deepstack_visual_indexes is not None: + self.merger_list = nn.ModuleList( + [ + Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + spatial_merge_size=self.spatial_merge_size, + use_postshuffle_norm=True, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.merger_list.{layer_idx}", + ) + for layer_idx in range(len(self.deepstack_visual_indexes)) + ] + ) + + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype() + ) + if self.attn_backend != _Backend.FLASH_ATTN and check_upstream_fa_availability( + torch.get_default_dtype() + ): + self.attn_backend = _Backend.FLASH_ATTN + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = 
torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: + num_grid_per_side = self.num_grid_per_side + m_size = self.spatial_merge_size + hidden_dim = self.pos_embed.embedding_dim + + outputs = [] + for t, h, w in grid_thw: + h_idxs = torch.linspace( + 0, num_grid_per_side - 1, h, dtype=torch.float32, device=self.device + ) + w_idxs = torch.linspace( + 0, num_grid_per_side - 1, w, dtype=torch.float32, device=self.device + ) + + h_floor = h_idxs.to(torch.long) + w_floor = w_idxs.to(torch.long) + h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1) + w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1) + + dh = h_idxs - h_floor + dw = w_idxs - w_floor + + # Create meshgrid view for all h, w vars + dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing="ij") + h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing="ij") + h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing="ij") + h_floor_grid_idx = h_floor_grid * num_grid_per_side + h_ceil_grid_idx = h_ceil_grid * num_grid_per_side + + # original computation of weights + # w00 = (1 - dh_grid) * (1 - dw_grid) + # 
w01 = (1 - dh_grid) * dw_grid + # w10 = dh_grid * (1 - dw_grid) + # w11 = dh_grid * dw_grid + # we reuse w11 here to avoid duplicate + # dh_grid * dw_grid computation + w11 = dh_grid * dw_grid + w10 = dh_grid - w11 + w01 = dw_grid - w11 + w00 = 1 - dh_grid - dw_grid + w11 + + idx00 = h_floor_grid_idx + w_floor_grid + idx01 = h_floor_grid_idx + w_ceil_grid + idx10 = h_ceil_grid_idx + w_floor_grid + idx11 = h_ceil_grid_idx + w_ceil_grid + + indices = torch.stack([idx00, idx01, idx10, idx11], dim=0).reshape(4, -1) + weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1) + weights = weights.to(dtype=self.dtype, device=self.device) + + embeds = self.pos_embed(indices) + weighted_embeds = embeds * weights + p0, p1, p2, p3 = weighted_embeds.unbind(dim=0) + combined = p0 + p1 + p2 + p3 + + combined = combined.view(h * w, hidden_dim) + repeated = combined.unsqueeze(0).expand(t, -1, -1).contiguous() + repeated = repeated.view( + t, h // m_size, m_size, w // m_size, m_size, hidden_dim + ) + repeated = repeated.permute(0, 1, 3, 2, 4, 5).reshape(-1, hidden_dim) + outputs.append(repeated) + + return torch.cat(outputs, dim=0) + + def compute_attn_mask_seqlen( + self, + cu_seqlens: torch.Tensor, + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + elif self.attn_backend == _Backend.TORCH_SDPA: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + return max_seqlen, seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor: + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + if self.apply_vit_abs_pos_embed: + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + 
rotary_pos_emb = self.rot_pos_emb(grid_thw) + + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] + ).cumsum( + dim=0, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + hidden_states = hidden_states.unsqueeze(1) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + hidden_states_list = [] + deepstack_visual_indexes = self.deepstack_visual_indexes + + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + if ( + deepstack_visual_indexes is not None + and layer_num in deepstack_visual_indexes + ): + hidden_states_list.append(hidden_states) + + hidden_states = self.merger(hidden_states) + + # processing deepstack + if deepstack_visual_indexes is not None: + processed_hidden_states_list = [hidden_states] + for idx, x in enumerate(hidden_states_list): + x = self.merger_list[idx](x) + processed_hidden_states_list.append(x) + # we cat the original visual features and deepstack features + # along the feature dim + hidden_states = torch.cat( + processed_hidden_states_list, dim=1 + ) # [seq_len, hidden_size * (1 + depth_of_deepstack)] + + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("attn.qkv.", "attn.q.", "q"), + ("attn.qkv.", "attn.k.", "k"), + ("attn.qkv.", "attn.v.", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = 
param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + "deepstack_input_embeds": 0, + } +) +class Qwen3MoeLLMModel(Qwen3MoeModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + self.deepstack_multiscale_layer_start = 1 + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + deepstack_input_embeds: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for layer_idx, layer in enumerate( + self.layers[self.start_layer : self.end_layer] + ): + layer_idx = layer_idx + self.start_layer + + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + if deepstack_input_embeds is not None and layer_idx in range( + 0, len(deepstack_input_embeds) + ): + hidden_states = ( + hidden_states + + deepstack_input_embeds[f"deepstack_input_embeds_{layer_idx}"] + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM): + def 
class Qwen3OmniMoeThinkerProcessingInfo(
    Qwen2AudioProcessingInfo, Qwen2_5_VLProcessingInfo
):
    """Processing info for the Qwen3-Omni-MoE thinker."""

    def get_hf_config(self):
        """Return the ``thinker_config`` of the ``Qwen3OmniMoeConfig``."""
        omni_config = self.ctx.get_hf_config(Qwen3OmniMoeConfig)
        return omni_config.thinker_config

    def get_hf_processor(self, **kwargs: object) -> Qwen3OmniMoeProcessor:
        """Return the HF processor, filling in missing placeholder tokens.

        ``use_fast`` defaults to ``True`` when not supplied. Any of
        ``audio_token`` / ``image_token`` / ``video_token`` that the
        processor lacks is set to its ``<|..._pad|>`` string.
        """
        use_fast = kwargs.pop("use_fast", True)
        processor = self.ctx.get_hf_processor(
            Qwen3OmniMoeProcessor,
            use_fast=use_fast,
            **kwargs,
        )
        fallback_tokens = {
            "audio_token": "<|audio_pad|>",
            "image_token": "<|image_pad|>",
            "video_token": "<|video_pad|>",
        }
        for attr, token in fallback_tokens.items():
            if not hasattr(processor, attr):
                setattr(processor, attr, token)
        return processor

    def get_feature_extractor(self, **kwargs: object):
        """Return the processor's feature extractor (a WhisperFeatureExtractor)."""
        feature_extractor = self.get_hf_processor(**kwargs).feature_extractor  # type: ignore
        assert isinstance(feature_extractor, WhisperFeatureExtractor)
        return feature_extractor

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        """Return a ``None`` limit for audio, image and video."""
        return dict.fromkeys(("audio", "image", "video"))
def _maybe_apply_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    prompt_ids: list[int],
    mm_kwargs: MultiModalKwargsItems,
    mm_prompt_updates: MultiModalPromptUpdates,
    is_update_applied: bool,
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
    """
    Qwen3-Omni reimplements this function to handle `use_audio_in_video`.

    Args:
        mm_items: Parsed multimodal items; only their per-modality counts
            are read here.
        prompt_ids: Token ids of the prompt.
        mm_kwargs: Processed multimodal kwargs; the "video" entries are
            inspected for their `use_audio_in_video` flag.
        mm_prompt_updates: Prompt updates to find or apply.
        is_update_applied: Whether the placeholders have already been
            expanded inside `prompt_ids`.

    Returns:
        A tuple `(prompt_ids, prompt_text, mm_placeholders)`.
    """
    mm_item_counts = mm_items.get_all_counts()
    self._validate_mm_kwargs(mm_kwargs, mm_item_counts)

    # NOTE: the flag of the *last* video item decides the mode, exactly as
    # before; every item is still visited.
    use_audio_in_video = False
    if "video" in mm_kwargs:
        for item in mm_kwargs["video"]:
            use_audio_in_video = bool(item and item["use_audio_in_video"].data)

    if use_audio_in_video and "video" in mm_item_counts:
        # Audio extracted from a video is counted as an audio item but has
        # no placeholder of its own, so it must not be expected below.
        assert "audio" in mm_item_counts
        mm_item_counts["audio"] -= mm_item_counts["video"]

    if use_audio_in_video or not is_update_applied:
        # Special case with `use_audio_in_video=True`: an already expanded
        # prompt is first collapsed back to its raw form, then the updates
        # are (re)applied like in the not-yet-applied case.
        if use_audio_in_video and is_update_applied:
            prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
        prompt_ids, new_text, mm_placeholders = self._apply_prompt_updates(
            prompt_ids,
            mm_prompt_updates,
        )
    else:
        # Normal case with `use_audio_in_video=False` and updates already
        # applied: only locate the placeholders.
        mm_placeholders = self._find_mm_placeholders(
            prompt_ids,
            mm_prompt_updates,
        )
        # `new_text` used to be left unbound on this path, which made the
        # return statement raise UnboundLocalError.  The prompt is
        # unchanged here, so its text is simply the decoded token ids.
        new_text = self.info.get_tokenizer().decode(prompt_ids)

    self._validate_mm_placeholders(mm_placeholders, mm_item_counts)

    return prompt_ids, new_text, mm_placeholders
thinker_config.position_id_per_seconds + audio_token_indices = np.arange(next(iter([audio_len]))) + curr_video_grid_thw = next(iter([video_grid_thw])) + height = curr_video_grid_thw[1] // spatial_merge_size + width = curr_video_grid_thw[2] // spatial_merge_size + video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1) + video_token_indices = np.broadcast_to( + video_token_indices, (video_token_indices.shape[0], height, width) + ).reshape(-1) + video_token_indices = ( + (video_token_indices + shift) + * next(iter([video_second_per_grid_t])) + * position_id_per_seconds + ) + video_data_index, audio_data_index = 0, 0 + updates = [audio_start_token_id] + while video_data_index < len(video_token_indices) and audio_data_index < len( + audio_token_indices + ): + if ( + video_token_indices[video_data_index] + <= audio_token_indices[audio_data_index] + ): + updates += [video_token_id] + video_data_index += 1 + else: + updates += [audio_token_id] + audio_data_index += 1 + if video_data_index < len(video_token_indices): + updates += [video_token_id] * (len(video_token_indices) - video_data_index) + if audio_data_index < len(audio_token_indices): + updates += [audio_token_id] * (len(audio_token_indices) - audio_data_index) + updates += [audio_end_token_id] + return updates + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) + vocab = tokenizer.get_vocab() + + audio_token = processor.audio_token + image_token = processor.image_token + video_token = processor.video_token + audio_token_id = vocab[audio_token] + image_token_id = vocab[image_token] + video_token_id = vocab[video_token] + + out_mm_data = out_mm_kwargs.get_data() + audio_feature_lengths = 
out_mm_data.get("audio_feature_lengths") + feature_attention_mask = out_mm_data.get("feature_attention_mask") + if audio_feature_lengths is None and feature_attention_mask is None: + audio_output_lengths = [] + elif audio_feature_lengths is not None: + _, audio_output_lens = _get_feat_extract_output_lengths( + audio_feature_lengths + ) + audio_output_lengths = audio_output_lens.tolist() + elif feature_attention_mask is not None: + assert isinstance(feature_attention_mask, torch.Tensor) + _, audio_output_lens = _get_feat_extract_output_lengths( + feature_attention_mask.sum(-1) + ) + audio_output_lengths = audio_output_lens.tolist() + + # number of audios read from video. + audio_in_video_item_idx = 0 + audio_item_idx = 0 + + def get_replacement_qwen2_audio(item_idx: int): + nonlocal audio_item_idx + item_idx += audio_in_video_item_idx + + audio_item_idx += 1 + + num_features = audio_output_lengths[item_idx] + if num_features == 0: + audios = mm_items.get_items("audio", AudioProcessorItems) + audio = audios.get(item_idx) + raise ValueError( + f"The audio {audio} (len={len(audio)}) is too short " + "to be represented inside the model" + ) + + return [audio_token_id] * num_features + + def get_replacement_qwen2_vision(item_idx: int, modality: str): + grid_thw = out_mm_data[f"{modality}_grid_thw"][item_idx] + assert isinstance(grid_thw, torch.Tensor) + merge_length = image_processor.merge_size**2 + + token_id = image_token_id if modality == "image" else video_token_id + return [token_id] * (int(grid_thw.prod()) // merge_length) + + use_audio_in_video = hf_processor_mm_kwargs.get("use_audio_in_video", False) + thinker_config = self.info.get_hf_config() + + def get_replacement_qwen2_use_audio_in_video(item_idx: int): + nonlocal audio_in_video_item_idx + audio_num_features = audio_output_lengths[audio_item_idx + item_idx] + video_grid_thw = out_mm_data["video_grid_thw"][item_idx] + + audio_in_video_item_idx += 1 + + second_per_grid_ts = 
hf_processor_mm_kwargs.get("second_per_grid_ts", None) + if second_per_grid_ts: + video_second_per_grid_t = second_per_grid_ts[item_idx] + else: + video_second_per_grid_t = 1.0 + + return self.get_updates_use_audio_in_video( + thinker_config=thinker_config, + audio_len=audio_num_features, + video_grid_thw=video_grid_thw, + video_second_per_grid_t=video_second_per_grid_t, + ) + + video_replacement_fn = ( + get_replacement_qwen2_use_audio_in_video + if use_audio_in_video + else partial(get_replacement_qwen2_vision, modality="video") + ) + + return [ + PromptReplacement( + modality="audio", + target=audio_token, + replacement=get_replacement_qwen2_audio, + ), + PromptReplacement( + modality="image", + target=image_token, + replacement=partial(get_replacement_qwen2_vision, modality="image"), + ), + PromptReplacement( + modality="video", + target=video_token, + replacement=video_replacement_fn, + ), + ] + + def _validate_mm_placeholders( + self, + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], + mm_item_counts: Mapping[str, int], + ) -> None: + BaseMultiModalProcessor[ + Qwen2_5OmniThinkerProcessingInfo + ]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts) + + def _get_raw_input_ids( + self, + token_ids: list[int], + use_audio_in_video: bool = False, + ) -> list[int]: + tokenizer = self.info.get_tokenizer() + vision_bos_token = tokenizer.encode(tokenizer.vision_bos_token)[0] + vision_eos_token = tokenizer.encode(tokenizer.vision_eos_token)[0] + audio_bos_token = tokenizer.encode(tokenizer.audio_bos_token)[0] + audio_eos_token = tokenizer.encode(tokenizer.audio_eos_token)[0] + audio_token = tokenizer.encode("<|audio_pad|>")[0] + image_token = tokenizer.encode("<|image_pad|>")[0] + video_token = tokenizer.encode("<|video_pad|>")[0] + + result = token_ids[:] + if use_audio_in_video: + while True: + start = None + for i in range(len(result) - 1): + if result[i : i + 2] == [vision_bos_token, audio_bos_token]: + start = i + break + if start is 
not None: + end = None + for i in range(start + 2, len(result) - 1): + if result[i : i + 2] == [audio_eos_token, vision_eos_token]: + end = i + break + if end is not None: + result = ( + result[:start] + + [vision_bos_token, video_token, vision_eos_token] + + result[end + 2 :] + ) + else: + break + + for mm_token in [audio_token, image_token, video_token]: + compressed = [] + for x in result: + if x != mm_token or (not compressed or compressed[-1] != mm_token): + compressed.append(x) + result = compressed + + return result + + +class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMixin): + def _validate_and_reshape_mm_tensor( + self, mm_input: object, name: str, dim: int = 0 + ) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. Got type: {type(mm_input)}") + if name == "feature_attention_mask": + dim = -1 + if isinstance(mm_input, torch.Tensor): + return torch.concat(list(mm_input), dim=dim) + else: + if isinstance(mm_input[0], list): + return torch.concat( + [torch.concat(mm_input[i], dim=dim) for i in range(len(mm_input))], + dim=dim, + ) + else: + return torch.concat(mm_input, dim=dim) + + def _process_audio_input( + self, + audio_input: Qwen2AudioFeatureInputs, + audio_hashes: list[str] = None, + cached_audio_features: torch.Tensor = None, + ) -> torch.Tensor: + input_features = audio_input["input_features"] + audio_feature_lengths = audio_input["audio_feature_lengths"] + + if input_features.ndim == 3: + assert input_features.shape[0] == 1 + input_features = input_features.squeeze(0) + + if not isinstance(audio_feature_lengths, torch.Tensor): + audio_feature_lengths = torch.cat(audio_feature_lengths) + if audio_feature_lengths.ndim == 2: + audio_feature_lengths = audio_feature_lengths.reshape(-1) + + # audio_feat_lengths, audio_output_lengths = ( + # self._get_feat_extract_output_lengths(audio_feature_lengths)) + audio_feat_lengths, audio_output_lengths = 
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder text for one multimodal item.

    Args:
        modality: Modality name; matched by prefix against "image",
            "video" and "audio", in that order.
        i: Index of the item; not used by this implementation.

    Raises:
        ValueError: If `modality` starts with none of the known prefixes.
    """
    placeholder_by_prefix = (
        ("image", "<|vision_start|><|image_pad|><|vision_end|>"),
        ("video", "<|vision_start|><|video_pad|><|vision_end|>"),
        ("audio", "<|audio_start|><|audio_pad|><|audio_end|>"),
    )
    for prefix, placeholder in placeholder_by_prefix:
        if modality.startswith(prefix):
            return placeholder

    raise ValueError("Only image, video or audio modality is supported")
def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors:
    """Return the first `num_tokens` rows of every deepstack buffer.

    The slices are wrapped in an `IntermediateTensors` keyed
    `deepstack_input_embeds_<level>`.  Nothing is zeroed here; resetting
    the buffers is what `_clear_deepstack_input_embeds` does.
    """
    buffered = {}
    for level in range(self.deepstack_num_level):
        key = f"deepstack_input_embeds_{level}"
        buffered[key] = self.deepstack_input_embeds[level][:num_tokens]
    return IntermediateTensors(buffered)
_set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> None: + # set deepstack_input_embeds to buffer + num_tokens = deepstack_input_embeds.size(1) + if num_tokens > self.deepstack_input_embeds[0].size(0): + self.deepstack_input_embeds = [ + torch.zeros( + num_tokens, + self.config.text_config.hidden_size, + device=self.deepstack_input_embeds[0].device, + dtype=self.deepstack_input_embeds[0].dtype, + ) + for _ in range(self.deepstack_num_level) + ] + for idx in range(self.deepstack_num_level): + self.deepstack_input_embeds[idx][:num_tokens].copy_( + deepstack_input_embeds[idx] + ) + + def _clear_deepstack_input_embeds(self, num_tokens: int) -> None: + # clear deepstack_input_embeds in buffer + if num_tokens > 0: + for idx in range(self.deepstack_num_level): + self.deepstack_input_embeds[idx][:num_tokens].zero_() + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + mm_input_by_modality = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. 
+ for input_key in kwargs: + if ( + input_key in ("pixel_values", "image_embeds") + and "image" not in mm_input_by_modality + ): + mm_input_by_modality["image"] = self._parse_and_validate_image_input( + **kwargs + ) + if ( + input_key in ("pixel_values_videos", "video_embeds") + and "video" not in mm_input_by_modality + ): + mm_input_by_modality["video"] = self._parse_and_validate_video_input( + **kwargs + ) + if ( + input_key in ("input_audio_features") + and "audio" not in mm_input_by_modality + ): + mm_input_by_modality["audio"] = self._parse_and_validate_audio_input( + **kwargs + ) + return mm_input_by_modality + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object + ) -> Optional[MultiModalEmbeddings]: + mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) + if not mm_input_by_modality: + return [] + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. 
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    *,
    is_multimodal: Optional[torch.Tensor] = None,
    handle_oov_mm_token: bool = False,
) -> torch.Tensor:
    """Embed `input_ids` and merge the multimodal embeddings into them.

    Embeddings whose last dimension differs from the text hidden size are
    treated as vision embeddings carrying extra deepstack features: the
    leading slice is merged into the returned tensor, the remainder is
    stored through `_set_deepstack_input_embeds`.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: One embedding tensor per multimodal item;
            the caller's container is not modified.
        is_multimodal: Mask of the multimodal token positions.
            NOTE(review): it is dereferenced unconditionally once
            embeddings are present, so it presumably must not be None in
            that case - confirm against the callers.
        handle_oov_mm_token: Forwarded to `_get_text_embeddings`.

    Returns:
        The merged input embeddings.
    """
    inputs_embeds = self._get_text_embeddings(
        input_ids,
        self.language_model.get_input_embeddings,
        is_multimodal=is_multimodal,
        handle_oov_mm_token=handle_oov_mm_token,
    )

    if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
        return inputs_embeds

    # TODO (ywang96): support overlapping modality embeddings so that
    # `use_audio_in_video` will work on V1.
    # split the feat dim to obtain multi-scale visual feature
    hidden_size = self.config.text_config.hidden_size
    has_vision_embeddings = [
        embeddings.shape[-1] != hidden_size for embeddings in multimodal_embeddings
    ]
    if self.visual.deepstack_visual_indexes is not None and any(
        has_vision_embeddings
    ):
        # Work on a private list: the entries are replaced below, which
        # would fail on a tuple (what `get_multimodal_embeddings` builds)
        # and would otherwise silently rewrite the caller's list.
        multimodal_embeddings = list(multimodal_embeddings)

        multiscale_len = len(self.visual.deepstack_visual_indexes)
        multimodal_embeddings_multiscale = []
        is_vision = torch.zeros_like(is_multimodal)
        mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0]
        mm_position_idx = 0
        for index, embeddings in enumerate(multimodal_embeddings):
            num_tokens = embeddings.shape[0]
            current_positions = mm_positions[
                mm_position_idx : mm_position_idx + num_tokens
            ]

            # Vision embeddings
            if embeddings.shape[-1] != hidden_size:
                visual_dim = embeddings.shape[-1] // (multiscale_len + 1)
                multi_dim = visual_dim * multiscale_len
                embeddings_main, embeddings_multiscale = torch.split(
                    embeddings, [visual_dim, multi_dim], dim=-1
                )
                multimodal_embeddings[index] = embeddings_main
                multimodal_embeddings_multiscale.append(embeddings_multiscale)
                is_vision[current_positions] = True

            # Audio embeddings
            else:
                is_vision[current_positions] = False

            mm_position_idx += num_tokens

        # `visual_dim` is always bound here because at least one entry took
        # the vision branch (guaranteed by `any(has_vision_embeddings)`).
        deepstack_input_embeds = inputs_embeds.new_zeros(
            inputs_embeds.size(0), multiscale_len * inputs_embeds.size(1)
        )
        deepstack_input_embeds = _merge_multimodal_embeddings(
            inputs_embeds=deepstack_input_embeds,
            multimodal_embeddings=multimodal_embeddings_multiscale,
            is_multimodal=is_vision,
        )
        deepstack_input_embeds = (
            deepstack_input_embeds.view(
                inputs_embeds.shape[0], multiscale_len, visual_dim
            )
            .permute(1, 0, 2)
            .contiguous()
        )
        self._set_deepstack_input_embeds(deepstack_input_embeds)

    inputs_embeds = _merge_multimodal_embeddings(
        inputs_embeds=inputs_embeds,
        multimodal_embeddings=multimodal_embeddings,
        is_multimodal=is_multimodal,
    )

    return inputs_embeds
def compute_logits(
    self,
    hidden_states: torch.Tensor,
) -> Optional[torch.Tensor]:
    """Delegate logit computation to the wrapped language model."""
    language_model = self.language_model
    logits = language_model.compute_logits(hidden_states)
    return logits
+ input_ids = torch.tensor(input_tokens) + if input_ids is None or input_ids.ndim != 1: + raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids") + + seq_len = input_ids.shape[0] + if audio_feature_lengths is not None and not isinstance( + audio_feature_lengths, torch.Tensor + ): + audio_feature_lengths = torch.as_tensor( + audio_feature_lengths, dtype=torch.long + ) + if second_per_grid_ts is None: + if video_grid_thw is not None and video_grid_thw.numel() > 0: + second_per_grids = torch.ones( + video_grid_thw.shape[0], dtype=torch.float32 + ) + else: + second_per_grids = torch.tensor([], dtype=torch.float32) + else: + second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32) + + spatial_merge_size = config.vision_config.spatial_merge_size + image_token_id = config.image_token_id + video_token_id = config.video_token_id + audio_token_id = config.audio_token_id + vision_start_token_id = config.vision_start_token_id + audio_start_token_id = config.audio_start_token_id + position_id_per_seconds = config.position_id_per_seconds + + vision_start_indices = torch.argwhere( + input_ids == vision_start_token_id + ).squeeze(1) + if vision_start_indices.numel() > 0: + vision_tokens = input_ids[vision_start_indices + 1] + else: + vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype) + audio_nums = torch.sum(input_ids == audio_start_token_id) + image_nums = (vision_tokens == image_token_id).sum() + video_nums = ( + (vision_tokens == audio_start_token_id).sum() + if use_audio_in_video + else (vision_tokens == video_token_id).sum() + ) + + llm_pos_ids_list: list[torch.Tensor] = [] + st = 0 + image_idx = 0 + video_idx = 0 + audio_idx = 0 + remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums # noqa: E501 + multimodal_nums = ( + image_nums + audio_nums + if use_audio_in_video + else image_nums + video_nums + audio_nums + ) # noqa: E501 + + for _ in range(multimodal_nums): + st_idx = llm_pos_ids_list[-1].max() 
+ 1 if llm_pos_ids_list else 0 + if (image_token_id in input_tokens or video_token_id in input_tokens) and ( + remain_videos > 0 or remain_images > 0 + ): + ed_vision_start = input_tokens.index(vision_start_token_id, st) + else: + ed_vision_start = len(input_tokens) + 1 + if audio_token_id in input_tokens and remain_audios > 0: + ed_audio_start = input_tokens.index(audio_start_token_id, st) + else: + ed_audio_start = len(input_tokens) + 1 + min_ed = min(ed_vision_start, ed_audio_start) + + if min_ed == ed_audio_start: + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + _, audio_len = _get_feat_extract_output_lengths( + audio_feature_lengths[audio_idx] + ) + llm_pos_ids = ( + torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + llm_pos_ids_list.append(llm_pos_ids) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st += text_len + bos_len + audio_len + eos_len + audio_idx += 1 + remain_audios -= 1 + elif ( + min_ed == ed_vision_start + and input_ids[ed_vision_start + 1] == image_token_id + ): + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + llm_pos_ids_list.append( + torch.arange(bos_len, 
dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + grid_t = image_grid_thw[image_idx][0] + grid_hs = image_grid_thw[:, 1] + grid_ws = image_grid_thw[:, 2] + t_index = torch.arange(grid_t) * position_id_per_seconds + llm_pos_ids = get_llm_pos_ids_for_vision( + st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws + ) + image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2) + llm_pos_ids_list.append(llm_pos_ids) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st += text_len + bos_len + image_len + eos_len + image_idx += 1 + remain_images -= 1 + elif ( + min_ed == ed_vision_start + and input_ids[ed_vision_start + 1] == video_token_id + and not use_audio_in_video + ): + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + grid_t = video_grid_thw[video_idx][0] + grid_hs = video_grid_thw[:, 1] + grid_ws = video_grid_thw[:, 2] + t_index = ( + torch.arange(grid_t) + * float(second_per_grids[video_idx].item()) + * position_id_per_seconds + ) + llm_pos_ids = get_llm_pos_ids_for_vision( + st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws + ) + video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) + llm_pos_ids_list.append(llm_pos_ids) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + llm_pos_ids_list.append( + 
torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st += text_len + bos_len + video_len + eos_len + video_idx += 1 + remain_videos -= 1 + elif ( + min_ed == ed_vision_start + and ed_vision_start + 1 == ed_audio_start + and use_audio_in_video + ): + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + bos_block = ( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + llm_pos_ids_list.append(bos_block) + llm_pos_ids_list.append(bos_block) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + _, audio_len = _get_feat_extract_output_lengths( + audio_feature_lengths[audio_idx] + ) + audio_llm_pos_ids = ( + torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + grid_t = video_grid_thw[video_idx][0] + grid_hs = video_grid_thw[:, 1] + grid_ws = video_grid_thw[:, 2] + t_index = ( + torch.arange(grid_t) + * float(second_per_grids[video_idx].item()) + * position_id_per_seconds + ) + video_llm_pos_ids = get_llm_pos_ids_for_vision( + st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws + ) + video_data_index, audio_data_index = 0, 0 + while ( + video_data_index < video_llm_pos_ids.shape[-1] + and audio_data_index < audio_llm_pos_ids.shape[-1] + ): + if ( + video_llm_pos_ids[0][video_data_index] + <= audio_llm_pos_ids[0][audio_data_index] + ): + llm_pos_ids_list.append( + video_llm_pos_ids[ + :, video_data_index : video_data_index + 1 + ] + ) + video_data_index += 1 + else: + llm_pos_ids_list.append( + audio_llm_pos_ids[ + :, audio_data_index : audio_data_index + 1 + ] + ) + audio_data_index += 1 + if video_data_index < video_llm_pos_ids.shape[-1]: + llm_pos_ids_list.append( + video_llm_pos_ids[ + :, 
video_data_index : video_llm_pos_ids.shape[-1] + ] + ) + if audio_data_index < audio_llm_pos_ids.shape[-1]: + llm_pos_ids_list.append( + audio_llm_pos_ids[ + :, audio_data_index : audio_llm_pos_ids.shape[-1] + ] + ) + video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + eos_block = ( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + llm_pos_ids_list.append(eos_block) + llm_pos_ids_list.append(eos_block) + st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2 # noqa: E501 + audio_idx += 1 + video_idx += 1 + remain_videos -= 1 + remain_audios -= 1 + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + if llm_positions.shape[1] != seq_len: + raise RuntimeError("Position ids length mismatch with input ids length") + + mrope_position_delta = llm_positions.max() + 1 - seq_len + return llm_positions, mrope_position_delta + + def _get_text_embeddings( + self, + input_ids: torch.Tensor, + get_input_embeddings: Callable[[torch.Tensor], torch.Tensor], + *, + is_multimodal: Optional[torch.Tensor], + handle_oov_mm_token: bool, + ) -> torch.Tensor: + if handle_oov_mm_token and is_multimodal is not None: + is_text = ~is_multimodal + text_embeds = get_input_embeddings(input_ids[is_text]) + + return torch.empty( + (input_ids.shape[0], text_embeds.shape[1]), + dtype=text_embeds.dtype, + device=text_embeds.device, + ).masked_scatter_(is_text.unsqueeze_(-1), text_embeds) + + return get_input_embeddings(input_ids) diff --git a/vllm_kunlun/models/qwen3_vl.py b/vllm_kunlun/models/qwen3_vl.py new file mode 100644 index 0000000..f46b97c --- /dev/null +++ 
b/vllm_kunlun/models/qwen3_vl.py @@ -0,0 +1,1636 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Qwen3VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import Any, Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import BatchFeature +from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast +from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( + smart_resize as image_smart_resize) +from transformers.models.qwen3_vl import (Qwen3VLProcessor, + Qwen3VLVideoProcessor) +from transformers.models.qwen3_vl.configuration_qwen3_vl import ( + Qwen3VLConfig, Qwen3VLVisionConfig) +from transformers.models.qwen3_vl.video_processing_qwen3_vl import ( + smart_resize as video_smart_resize) +from transformers.video_utils import VideoMetadata + +from vllm.attention.layer import check_upstream_fa_availability +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY + +from vllm_kunlun.ops.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargsItem, + MultiModalKwargsItems, VideoItem) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + MultiModalDataParser) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + 
class Qwen3_VisionPatchEmbed(nn.Module):
    """Project flattened vision patches to ``hidden_size`` with a Conv3d.

    The convolution uses the same value for kernel size and stride,
    ``(temporal_patch_size, patch_size, patch_size)``.
    """

    def __init__(
        self,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        hidden_size: int = 1152,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.hidden_size = hidden_size

        patch_shape = (temporal_patch_size, patch_size, patch_size)
        self.proj = nn.Conv3d(
            in_channels,
            hidden_size,
            kernel_size=patch_shape,
            stride=patch_shape,
            bias=True,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed ``x`` (2-D, one flattened patch per row).

        Returns a tensor of shape ``(x.shape[0], hidden_size)``.
        """
        # Unpacking into two names also rejects inputs that are not 2-D.
        num_patches, _ = x.shape
        patches = x.view(
            num_patches,
            -1,
            self.temporal_patch_size,
            self.patch_size,
            self.patch_size,
        )
        projected = self.proj(patches)
        return projected.view(num_patches, self.hidden_size)
int, + bias: bool = False, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False): + super().__init__() + self.linear_fc1 = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + return_bias=False, + prefix=f"{prefix}.linear_fc1", + disable_tp=use_data_parallel) + self.linear_fc2 = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + return_bias=False, + prefix=f"{prefix}.linear_fc2", + disable_tp=use_data_parallel) + self.act_fn = act_fn + + def forward(self, x: torch.Tensor): + mlp_output = self.linear_fc2(self.act_fn(self.linear_fc1(x))) + return mlp_output + + +class Qwen3_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + attn_backend: _Backend = _Backend.TORCH_SDPA, + use_upstream_fa: bool = False, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + self.attn = Qwen2_5_VisionAttention( + embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel, + attn_backend=attn_backend, + use_upstream_fa=use_upstream_fa) + self.mlp = Qwen3_VisionMLP(dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = 
class Qwen3_VisionPatchMerger(nn.Module):
    """Merge groups of vision tokens and project them to ``d_model``.

    Tokens are viewed as rows of width
    ``context_dim * spatial_merge_size**2`` and passed through
    norm -> linear_fc1 -> GELU -> linear_fc2. With
    ``use_postshuffle_norm`` the norm is applied after the view (over the
    merged width); otherwise it is applied before it (over ``context_dim``).
    """

    def __init__(
        self,
        d_model: int,
        context_dim: int,
        norm_layer: Optional[Callable[[int], nn.Module]] = None,
        spatial_merge_size: int = 2,
        use_postshuffle_norm: bool = False,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        use_data_parallel: bool = False,
    ) -> None:
        super().__init__()
        merged_dim = context_dim * (spatial_merge_size**2)
        self.hidden_size = merged_dim
        self.use_postshuffle_norm = use_postshuffle_norm

        # The norm width depends on whether it runs before or after merging.
        norm_dim = merged_dim if use_postshuffle_norm else context_dim
        make_norm = (norm_layer if norm_layer is not None else partial(
            nn.LayerNorm, eps=1e-6))
        self.norm = make_norm(norm_dim)

        self.linear_fc1 = ColumnParallelLinear(
            merged_dim,
            merged_dim,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.linear_fc1",
            disable_tp=use_data_parallel,
        )
        self.act_fn = nn.GELU()
        self.linear_fc2 = RowParallelLinear(
            merged_dim,
            d_model,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.linear_fc2",
            disable_tp=use_data_parallel,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the merged and projected features for ``x``."""
        if self.use_postshuffle_norm:
            merged = self.norm(x.view(-1, self.hidden_size))
        else:
            merged = self.norm(x).view(-1, self.hidden_size)

        # Both linear layers return a pair; only the first item is used.
        hidden, _ = self.linear_fc1(merged)
        out, _ = self.linear_fc2(self.act_fn(hidden))
        return out
self.num_position_embeddings = vision_config.num_position_embeddings + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.spatial_merge_unit = self.spatial_merge_size**2 + self.temporal_patch_size = vision_config.temporal_patch_size + self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes + self.use_data_parallel = use_data_parallel + self.num_grid_per_side = int(self.num_position_embeddings**0.5) + + # NOTE: This is used for creating empty tensor for all_gather for + # DP ViT. Here out_hidden_size is enlarged due to deepstack + self.out_hidden_size = (vision_config.out_hidden_size * + (1 + len(self.deepstack_visual_indexes))) + + self.patch_embed = Qwen3_VisionPatchEmbed( + patch_size=self.patch_size, + temporal_patch_size=self.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) + + self.pos_embed = nn.Embedding(self.num_position_embeddings, + self.hidden_size) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.merger = Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, + ) + + self.deepstack_merger_list = nn.ModuleList([ + Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + spatial_merge_size=self.spatial_merge_size, + use_postshuffle_norm=True, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", + use_data_parallel=use_data_parallel) + for layer_idx in range(len(self.deepstack_visual_indexes)) + ]) + + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + 
def rot_pos_emb(self, grid_thw):
    """Build rotary embeddings for every patch described by ``grid_thw``.

    ``grid_thw`` is either a list of ``(t, h, w)`` triples or a tensor with
    one such triple per row. For each item the (row, column) index of every
    patch is produced in merge-window order (patches of one
    ``spatial_merge_size`` x ``spatial_merge_size`` window are adjacent)
    and repeated ``t`` times. The indices are then used to look up
    ``self.rotary_pos_emb(max_side)`` and the row/column parts are
    flattened together per patch.
    """
    merge = self.spatial_merge_size

    def _window_order(ids, h, w):
        # (h, w) grid -> flat sequence grouped by merge window.
        return ids.reshape(h // merge, merge, w // merge,
                           merge).permute(0, 2, 1, 3).flatten()

    # Support both Tensor and list inputs for DP path
    if isinstance(grid_thw, list):
        grids = grid_thw
        largest_side = max(max(h, w) for _, h, w in grids)
    else:
        grids = grid_thw.tolist()
        largest_side = int(grid_thw[:, 1:].max().item())

    per_item = []
    for t, h, w in grids:
        rows = _window_order(torch.arange(h).unsqueeze(1).expand(-1, w), h, w)
        cols = _window_order(torch.arange(w).unsqueeze(0).expand(h, -1), h, w)
        per_item.append(torch.stack([rows, cols], dim=-1).repeat(t, 1))
    coords = torch.cat(per_item, dim=0)

    freq_table = self.rotary_pos_emb(largest_side)
    return freq_table[coords].flatten(1)
def compute_attn_mask_seqlen(
    self,
    cu_seqlens: torch.Tensor,
) -> tuple[Optional[int], Optional[list[int]]]:
    """Derive the sequence-length argument needed by the attention backend.

    Returns ``(max_seqlen, seqlens)``:
      * FLASH_ATTN backend: the largest difference between consecutive
        ``cu_seqlens`` entries, and ``None``.
      * XFORMERS backend: ``None`` and the list of those differences.
      * any other backend: ``(None, None)``.
    """
    backend = self.attn_backend
    if backend == _Backend.FLASH_ATTN:
        return (cu_seqlens[1:] - cu_seqlens[:-1]).max().item(), None
    if backend == _Backend.XFORMERS:
        return None, (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
    return None, None
False + rotary_pos_emb_cos = repeat( + rotary_pos_emb_cos, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") # shape: [seq_len, 1, head_dim] + rotary_pos_emb_sin = repeat( + rotary_pos_emb_sin, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") # shape: [seq_len, 1, head_dim] + rotary_pos_emb_cos_sin_cache = torch.cat([rotary_pos_emb_cos, rotary_pos_emb_sin], dim=1) # shape: [seq_len, 2, head_dim] + + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb_cos_sin_cache, + max_seqlen=max_seqlen, + seqlens=seqlens) + if layer_num in self.deepstack_visual_indexes: + deepstack_merger_idx = self.deepstack_visual_indexes.index( + layer_num) + deepstack_feature = self.deepstack_merger_list[ + deepstack_merger_idx](hidden_states) + deepstack_feature_lists.append(deepstack_feature) + hidden_states = self.merger(hidden_states) + hidden_states = torch.cat( + [hidden_states] + deepstack_feature_lists, + dim=1) # [seq_len, hidden_size * (1 + depth_of_deepstack)] + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("attn.qkv.", "attn.q.", "q"), + ("attn.qkv.", "attn.k.", "k"), + ("attn.qkv.", "attn.v.", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + 
def _get_vision_info(
    self,
    *,
    image_width: int,
    image_height: int,
    num_frames: int = 2,
    do_resize: bool = True,
    image_processor: Optional[Union[Qwen2VLImageProcessorFast,
                                    Qwen3VLVideoProcessor]],
) -> tuple[ImageSize, int]:
    """Compute the preprocessed size and vision-token count of one item.

    Args:
        image_width: Input width in pixels.
        image_height: Input height in pixels.
        num_frames: Number of frames of the item.
        do_resize: Whether to apply the processor's ``smart_resize``.
        image_processor: Processor to use. When ``None`` the video
            processor is picked if ``num_frames > 1``, otherwise the image
            processor.

    Returns:
        ``(preprocessed_size, num_vision_tokens)`` where the token count is
        ``grid_t * grid_h * grid_w // merge_size**2``.
    """
    if image_processor is None and num_frames > 1:
        image_processor = self.get_video_processor()
    elif image_processor is None:
        image_processor = self.get_image_processor()

    is_video = isinstance(image_processor, Qwen3VLVideoProcessor)

    hf_config = self.get_hf_config()
    vision_config = hf_config.vision_config
    patch_size = vision_config.patch_size
    merge_size = vision_config.spatial_merge_size
    temporal_patch_size = vision_config.temporal_patch_size

    if do_resize:
        if is_video:
            # The video variant of smart_resize also needs the temporal
            # extent of the clip.
            smart_resize = video_smart_resize
            extra_kwargs = {
                "num_frames": num_frames,
                "temporal_factor": temporal_patch_size
            }
        else:
            smart_resize = image_smart_resize
            extra_kwargs = {}
        resized_height, resized_width = smart_resize(
            height=image_height,
            width=image_width,
            factor=patch_size * merge_size,
            min_pixels=image_processor.size["shortest_edge"],
            max_pixels=image_processor.size["longest_edge"],
            **extra_kwargs,
        )
        preprocessed_size = ImageSize(width=resized_width,
                                      height=resized_height)
    else:
        preprocessed_size = ImageSize(width=image_width,
                                      height=image_height)

    # Round the frame count up to the next multiple of temporal_patch_size.
    # (The previous `num_frames + num_frames % temporal_patch_size` only
    # reaches a multiple when temporal_patch_size is 1 or 2; the result is
    # unchanged for those values.)
    padded_num_frames = num_frames + (-num_frames) % temporal_patch_size

    grid_t = max(padded_num_frames // temporal_patch_size, 1)
    grid_h = preprocessed_size.height // patch_size
    grid_w = preprocessed_size.width // patch_size

    num_patches = grid_t * grid_h * grid_w
    num_vision_tokens = num_patches // (merge_size**2)

    return preprocessed_size, num_vision_tokens
len(timestamps), merge_size)] + return timestamps + + def _get_video_second_idx( + self, + metadata: dict[str, Any], + out_item: MultiModalKwargsItem, + do_sample_frames: Optional[bool] = None, + sampled_fps: Optional[float] = None) -> list[int]: + video_processor = self.get_video_processor() + merge_size = video_processor.merge_size + indices = metadata["frames_indices"] + + # metadata["fps"] refers to the true fps of the input video. + video_fps = metadata["fps"] + if do_sample_frames is None: + do_sample_frames = metadata.get("do_sample_frames", False) + + # If video frames are sampled in HF processor (instead of vLLM + # video loader), we need to re-calculate the indices from original + # metadata. + if do_sample_frames: + # here video_fps is the fps of the sampled video, and + # metadata["fps"] refers to the fps of the original video. + video_fps = sampled_fps if sampled_fps else video_processor.fps + total_num_frames = metadata["total_num_frames"] + num_frames = int(total_num_frames / metadata["fps"] * video_fps) + num_frames = min( + min(max(num_frames, video_processor.min_frames), + video_processor.max_frames), total_num_frames) + indices = np.linspace(0, total_num_frames - 1, + num_frames).round().astype(int).tolist() + timestamps = self._calculate_timestamps(indices, video_fps, merge_size) + return timestamps + + +class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + image_token = "<|vision_start|><|image_pad|><|vision_end|>" + video_token = "<|vision_start|><|video_pad|><|vision_end|>" + + return image_token * num_images + video_token * num_videos + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + target_width, target_height = ( + 
self.info.get_image_size_with_most_features()) + target_num_frames = self.info.get_num_frames_with_most_features( + seq_len, mm_counts) + target_video_size, _ = self.info._get_vision_info( + image_width=target_width, + image_height=target_height, + num_frames=target_num_frames, + image_processor=self.info.get_video_processor(), + ) + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_video_size.width, + height=target_video_size.height, + num_frames=target_num_frames, + num_videos=num_videos, + ), + } + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[VideoItem]: + min_width = 64 + min_height = 64 + min_frames = 2 + + width = max(min_width, width // 16) + height = max(min_height, height // 16) + num_frames = max(min_frames, min(num_frames, 8)) + + video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) + video_items = [] + for i in range(num_videos): + video_metadata = { + "fps": 2.0, + "duration": num_frames / 2.0, + "total_num_frames": num_frames, + "frames_indices": [i for i in range(num_frames)], + "video_backend": "opencv", + "do_sample_frames": False, + } + video_item = (video.copy(), video_metadata) + video_items.append(video_item) + return video_items + + def get_dummy_processor_inputs(self, seq_len, mm_counts): + processor_inputs = super().get_dummy_processor_inputs( + seq_len, mm_counts) + # HACK(Isotr0py): We set do_resize to False here to reuse Qwen2-VL's + # profiling logic, which will be problematic for configurable mm + # profiling. + # TODO(Isotr0py): Switch to the implementation in + # https://github.com/vllm-project/vllm/pull/25557 + # after supporting configurable mm profiling. 
def _call_hf_processor(
    self,
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, object],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature:
    """Run the HF processor, handling each video separately from images.

    Every video in ``mm_data["videos"]`` is processed on its own with a
    single-placeholder prompt; the decoded result replaces the next
    ``<|vision_start|><|video_pad|><|vision_end|>`` occurrence in
    ``prompt``. The remaining data (with ``videos`` removed) is then
    processed with the rewritten prompt, and the concatenated
    ``pixel_values_videos`` / ``video_grid_thw`` are merged into the result.
    """
    mm_data = dict(mm_data)
    processor = self.info.get_hf_processor(**mm_kwargs)

    # Separate video processing from image processing, because the videos
    # are processed into several image patches.
    if ("videos" in mm_data and isinstance(mm_data["videos"], list)
            and len(mm_data["videos"]) > 0):
        video_grid_thw_lst = []
        pixel_values_videos_lst = []

        for item_idx, item in enumerate(mm_data.pop("videos", [])):
            video_array, metadata = item

            # NOTE: @JJJYmmm new attr metadata.frames_indices indicates
            # the sampled frames indices of pre-sampled videos, which is
            # used to calculate the timestamps. Make sure that
            # do_sample_frames in mm_kwargs is false for presampled videos.

            # NOTE: a copy of mm_kwargs is created to update
            # do_sample_frames, otherwise mm_hash for the object will be
            # incorrect.
            video_mm_kwargs = dict(**mm_kwargs)
            if "do_sample_frames" not in video_mm_kwargs:
                # qwen_vl_utils already has "do_sample_frames" in
                # mm_kwargs, don't overwrite it.
                video_mm_kwargs["do_sample_frames"] = metadata.get(
                    "do_sample_frames", False)

            # "do_sample_frames" is dropped before building VideoMetadata.
            metadata = VideoMetadata(**{
                k: metadata[k]
                for k in metadata if k != "do_sample_frames"
            })

            video_mm_data = dict()
            video_mm_data["videos"] = [[video_array]]
            video_mm_data["video_metadata"] = [[metadata]]

            video_outputs = super()._call_hf_processor(
                prompt="<|vision_start|><|video_pad|><|vision_end|>",
                mm_data=video_mm_data,
                mm_kwargs=video_mm_kwargs,
                tok_kwargs=tok_kwargs,
            )
            input_ids = video_outputs.pop("input_ids")
            video_placeholder = processor.tokenizer.batch_decode(
                input_ids)[0]
            # Replace only the first remaining placeholder so that each
            # video consumes exactly one placeholder, in order.
            prompt = prompt.replace(
                "<|vision_start|><|video_pad|><|vision_end|>",
                video_placeholder,
                1,
            )

            video_grid_thw_lst.append(video_outputs["video_grid_thw"])
            pixel_values_videos_lst.append(
                video_outputs["pixel_values_videos"])
        video_outputs = dict(
            pixel_values_videos=torch.cat(pixel_values_videos_lst),
            video_grid_thw=torch.cat(video_grid_thw_lst),
        )
    else:
        video_outputs = dict()

    # Process the rest (videos were popped above) with the updated prompt.
    processed_outputs = super()._call_hf_processor(
        prompt=prompt,
        mm_data=mm_data,
        mm_kwargs=mm_kwargs,
        tok_kwargs=tok_kwargs,
    )
    combined_outputs = dict(
        processed_outputs,
        **video_outputs,
    )
    return BatchFeature(combined_outputs)
video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor( + **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + hf_config = self.info.get_hf_config() + + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + vision_end_token_id = hf_config.vision_end_token_id + + merge_length = image_processor.merge_size**2 + + def get_image_replacement_qwen3vl(item_idx: int): + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item["image_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + num_tokens = int(grid_thw.prod()) // merge_length + return [hf_processor.image_token_id] * num_tokens + + def get_video_replacement_qwen3vl(item_idx: int): + out_item = out_mm_kwargs["video"][item_idx] + grid_thw = out_item["video_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + video, metadata = mm_items["video"][item_idx] + do_sample_frames = hf_processor_mm_kwargs.get("do_sample_frames") + sampled_fps = hf_processor_mm_kwargs.get("fps") + if is_list_of(sampled_fps, float): + sampled_fps = sampled_fps[item_idx] + timestamps = self.info._get_video_second_idx( + metadata, out_item, do_sample_frames, sampled_fps) + + assert len(timestamps) == grid_thw[0], ( + f"The timestamps length({len(timestamps)}) should be equal " + f"video length ({grid_thw[0]}).") + + frames_idx_token = [ + tokenizer.encode(f"<{curr_time:.1f} seconds>", + add_special_tokens=False) + for curr_time in timestamps + ] + num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length + placeholder = [] + for frame_idx in frames_idx_token: + 
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
        # otherwise (seq_len, ).
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
        # the same shape as input_embeds
        "deepstack_input_embeds": 0
    })
class Qwen3LLMModel(Qwen3Model):
    """Qwen3 decoder that adds deepstack visual embeddings to early layers.

    After decoder layer ``i`` runs, ``deepstack_input_embeds``'s entry
    ``"deepstack_input_embeds_{i}"`` is added to the hidden states for every
    ``i < len(deepstack_input_embeds)``.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        # On non-first pipeline ranks the first local layer must come after
        # all deepstack layers, otherwise this check fails.
        if not get_pp_group().is_first_rank:
            assert self.start_layer >= len(
                vllm_config.model_config.hf_config.vision_config.
                deepstack_visual_indexes), (
                    "start_layer should be greater than or equal to "
                    "len(deepstack_visual_indexes)")

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        # args for deepstack
        deepstack_input_embeds: Optional[IntermediateTensors] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run this rank's decoder layers.

        Returns the normalized hidden states on the last pipeline rank and
        an ``IntermediateTensors`` with ``hidden_states`` and ``residual``
        on every other rank.
        """
        if get_pp_group().is_first_rank:
            # The first rank starts from embeddings (given or looked up).
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            # Later ranks continue from the previous rank's outputs.
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        for layer_idx, layer in enumerate(
                self.layers[self.start_layer:self.end_layer]):
            # Convert the local loop index to the global layer index.
            layer_idx = layer_idx + self.start_layer

            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )

            # Only the first len(deepstack_input_embeds) layers receive a
            # deepstack contribution.
            if deepstack_input_embeds is not None and \
                    layer_idx in range(0, len(deepstack_input_embeds)):
                hidden_states = hidden_states + deepstack_input_embeds[
                    f"deepstack_input_embeds_{layer_idx}"]

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
= ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix="lm_head") + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + +@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, + info=Qwen3VLProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder) +class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + supports_encoder_tp_data = True + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.visual.": "visual.", + "lm_head.": "language_model.lm_head.", + "model.language_model.": "language_model.model.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|vision_start|><|image_pad|><|vision_end|>" + if modality.startswith("video"): + return "<|vision_start|><|video_pad|><|vision_end|>" + + raise ValueError("Only image or video modality is supported") + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + super().__init__() + config: Qwen3VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + if not multimodal_config.get_limit_per_prompt("image") and \ + not multimodal_config.get_limit_per_prompt("video"): + self.visual = None + else: + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, 
+ prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, + ) + + self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, + "language_model")) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + self.use_deepstack = hasattr(config.vision_config, + 'deepstack_visual_indexes') + self.deepstack_num_level = len( + config.vision_config.deepstack_visual_indexes + ) if self.use_deepstack else 0 + # register buffer for deepstack + if self.use_deepstack and self.visual is not None: + self.deepstack_input_embeds = [ + torch.zeros( + vllm_config.scheduler_config.max_num_batched_tokens, + config.text_config.hidden_size) + for _ in range(self.deepstack_num_level) + ] + else: + self.deepstack_input_embeds = None + self.visual_dim = config.vision_config.out_hidden_size + self.multiscale_dim = self.visual_dim * self.deepstack_num_level + + def _get_deepstack_input_embeds(self, + num_tokens: int) -> IntermediateTensors: + # get the first num_tokens rows of each deepstack buffer (not cleared here) + return IntermediateTensors({ + f"deepstack_input_embeds_{idx}": + self.deepstack_input_embeds[idx][:num_tokens] + for idx in range(self.deepstack_num_level) + }) + + def _set_deepstack_input_embeds( + self, deepstack_input_embeds: torch.Tensor) -> None: + # set deepstack_input_embeds to buffer + num_tokens = deepstack_input_embeds.size(1) + if num_tokens > self.deepstack_input_embeds[0].size(0): + self.deepstack_input_embeds = [ + torch.zeros(num_tokens, + self.config.text_config.hidden_size, + device=self.deepstack_input_embeds[0].device, + dtype=self.deepstack_input_embeds[0].dtype) + for _ in range(self.deepstack_num_level) + ] + for idx in range(self.deepstack_num_level): + self.deepstack_input_embeds[idx][:num_tokens].copy_( + deepstack_input_embeds[idx]) + + def _clear_deepstack_input_embeds(self, num_tokens: int) -> None: + # clear deepstack_input_embeds in buffer + if 
num_tokens > 0: + for idx in range(self.deepstack_num_level): + self.deepstack_input_embeds[idx][:num_tokens].zero_() + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Qwen2_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return Qwen2_5_VLImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + second_per_grid_ts = kwargs.pop("second_per_grid_ts", None) + + if pixel_values_videos is None and video_embeds is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. 
" + f"Got type: {type(video_embeds)}") + return Qwen2_5_VLVideoEmbeddingInputs( + type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) + + def _process_image_input( + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) + + # Split concatenated embeddings for each image item. + # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync + merge_size = self.visual.spatial_merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() + return image_embeds.split(sizes) + + def _process_video_input( + self, + video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d") + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) + + # Split concatenated embeddings for each video item. 
+ # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync + merge_size = self.visual.spatial_merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() + return video_embeds.split(sizes) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + mm_input_by_modality = {} + for input_key in kwargs: + if input_key in ("pixel_values", "image_embeds" + ) and "image" not in mm_input_by_modality: + mm_input_by_modality[ + "image"] = self._parse_and_validate_image_input(**kwargs) + if input_key in ("pixel_values_videos", "video_embeds" + ) and "video" not in mm_input_by_modality: + mm_input_by_modality[ + "video"] = self._parse_and_validate_video_input(**kwargs) + return mm_input_by_modality + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + mm_input_by_modality = self._parse_and_validate_multimodal_inputs( + **kwargs) + if not mm_input_by_modality: + return None + + # The result multimodal_embeddings is a tuple of tensors, with each + # tensor corresponding to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. 
+ for modality in mm_input_by_modality: + multimodal_input = mm_input_by_modality[modality] + if modality == "image": + vision_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += vision_embeddings + if modality == "video": + video_embeddings = self._process_video_input(multimodal_input) + multimodal_embeddings += video_embeddings + return multimodal_embeddings + + def _compute_deepstack_embeds( + self, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings) -> torch.Tensor: + visual_lens = [ + x.shape[0] if isinstance(x, torch.Tensor) else len(x) + for x in multimodal_embeddings + ] + multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) + + multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501 + multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim], + dim=-1) + + multimodal_embeddings = torch.split(multimodal_embeddings_main, + visual_lens, + dim=0) + multimodal_embeddings_multiscale = torch.split( + multimodal_embeddings_multiscale, visual_lens, dim=0) + + deepstack_input_embeds = inputs_embeds.new_zeros( + inputs_embeds.size(0), + self.deepstack_num_level * inputs_embeds.size(1)) + + deepstack_input_embeds = merge_multimodal_embeddings( + input_ids, + deepstack_input_embeds, + multimodal_embeddings_multiscale, + placeholder_token_id=[ + self.config.image_token_id, self.config.video_token_id + ], + ) + deepstack_input_embeds = deepstack_input_embeds.view( + inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim) + deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2) + return deepstack_input_embeds, multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + deepstack_input_embeds = None + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: 
+ if self.use_deepstack: + deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 + input_ids, inputs_embeds, multimodal_embeddings) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) + + if self.use_deepstack: + if deepstack_input_embeds is None: + deepstack_input_embeds = torch.zeros_like( + inputs_embeds).unsqueeze(0).repeat( + self.deepstack_num_level, 1, 1).contiguous() + self._set_deepstack_input_embeds(deepstack_input_embeds) + + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[Qwen2_5_VLImageInputs] = None, + video_input: Optional[Qwen2_5_VLVideoInputs] = None, + ) -> torch.Tensor: + inputs_embeds = self.get_input_embeddings(input_ids) + + if self.use_deepstack: + visual_dim = inputs_embeds.shape[-1] + deepstack_input_embeds = None + if image_input is not None or video_input is not None: + deepstack_input_embeds = torch.zeros_like( + inputs_embeds).unsqueeze(1).repeat( + 1, self.deepstack_num_level, 1).flatten(1) + + if image_input is not None: + image_embeds = self._process_image_input(image_input) + if self.use_deepstack: + image_embeds = torch.cat(image_embeds) + + image_embeds, image_embeds_multiscale = image_embeds.split( + [visual_dim, visual_dim * self.deepstack_num_level], + dim=-1) + + deepstack_input_embeds = merge_multimodal_embeddings( + input_ids, + deepstack_input_embeds, + image_embeds_multiscale, + placeholder_token_id=self.config.image_token_id, + ) + + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + if self.use_deepstack: + video_embeds = torch.cat(video_embeds) + + video_embeds, video_embeds_multiscale = video_embeds.split( + [visual_dim, visual_dim 
* self.deepstack_num_level], + dim=-1) + + deepstack_input_embeds = merge_multimodal_embeddings( + input_ids, + deepstack_input_embeds, + video_embeds_multiscale, + placeholder_token_id=self.config.video_token_id, + ) + + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) + + if self.use_deepstack and deepstack_input_embeds is not None: + deepstack_input_embeds = deepstack_input_embeds.view( + inputs_embeds.shape[0], self.deepstack_num_level, + visual_dim).permute(1, 0, 2).contiguous() + self._set_deepstack_input_embeds(deepstack_input_embeds) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + """Run forward pass for Qwen3VL. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen3VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from previous pipeline + stages. + inputs_embeds: Pre-computed input embeddings. + **kwargs: Additional keyword arguments including: + - pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in + LLM. `None` if no images are passed. + - pixel_values_videos: Pixel values of videos to be fed to a + model. `None` if no videos are passed. + - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in + LLM. `None` if no videos are passed. 
+ """ + + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. + elif inputs_embeds is None: + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None + + if self.use_deepstack and inputs_embeds is not None and get_pp_group( + ).is_first_rank: + deepstack_input_embeds = self._get_deepstack_input_embeds( + inputs_embeds.size(0)) + else: + deepstack_input_embeds = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + # args for deepstack + deepstack_input_embeds=deepstack_input_embeds, + ) + + if inputs_embeds is not None and get_pp_group().is_first_rank: + self._clear_deepstack_input_embeds(inputs_embeds.size(0)) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + 
""" + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="model.visual.merger", + tower_model="model.visual.", + ) diff --git a/vllm_kunlun/models/qwen3_vl_moe.py b/vllm_kunlun/models/qwen3_vl_moe.py new file mode 100644 index 0000000..6d1685b --- /dev/null +++ b/vllm_kunlun/models/qwen3_vl_moe.py @@ -0,0 +1,358 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Qwen3-VL-MoE model compatible with HuggingFace weights.""" +import typing +from collections.abc import Iterable +from typing import Callable, Optional, Union + +import torch +from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import ( + Qwen3VLMoeConfig) + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors + +from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel +from .qwen3_vl import (Qwen3_VisionTransformer, Qwen3VLDummyInputsBuilder, + Qwen3VLForConditionalGeneration, + Qwen3VLMultiModalProcessor, Qwen3VLProcessingInfo) +from vllm.model_executor.models.utils import is_pp_missing_parameter, maybe_prefix + +logger = init_logger(__name__) + + +class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3VLMoeConfig) + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + # the same shape as input_embeds + "deepstack_input_embeds": 0 + }) +class Qwen3MoeLLMModel(Qwen3MoeModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + if not get_pp_group().is_first_rank: + assert self.start_layer >= len( + vllm_config.model_config.hf_config.vision_config. 
+ deepstack_visual_indexes), ( + "start_layer should be greater than or equal to " + "len(deepstack_visual_indexes)") + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + deepstack_input_embeds: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for layer_idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + layer_idx = layer_idx + self.start_layer + + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + if deepstack_input_embeds is not None and \ + layer_idx in range(0, len(deepstack_input_embeds)): + hidden_states = hidden_states + deepstack_input_embeds[ + f"deepstack_input_embeds_{layer_idx}"] + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_fused_expert_weights(self, name: str, params_dict: dict, + loaded_weight: torch.Tensor, shard_id: str, + num_experts: int) -> bool: + param = params_dict[name] + weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False + for expert_id in range(num_experts): + curr_expert_weight = loaded_weight[expert_id] + success = weight_loader(param, + curr_expert_weight, + name, + shard_id, + expert_id, + return_success=True) + if success: + loaded_local_expert = True + + return loaded_local_expert + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) 
-> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + # Skip loading extra parameters for GPTQ/modelopt models. + ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale", + ".v_scale", "_v_scale", ".weight_scale", + "_weight_scale", ".input_scale", "_input_scale") + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + is_fused_expert = False + fused_expert_params_mapping = [ + ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"), + ("experts.w2_weight", "experts.down_proj", 0, "w2"), + ] + num_experts = self.config.get_text_config().num_experts + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if ("experts.gate_up_proj" in name + or "experts.down_proj" in name): + is_fused_expert = True + expert_params_mapping = fused_expert_params_mapping + + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if "mlp.experts" in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith(ignore_suffixes) and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + name_mapped = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name_mapped, self): + continue + if is_fused_expert: + loaded_weight = loaded_weight.transpose(-1, + -2) # no bias + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-2) + success_w1 = self.load_fused_expert_weights( + name_mapped, params_dict, loaded_weight[0], + "w1", num_experts) + success_w3 = self.load_fused_expert_weights( + name_mapped, params_dict, loaded_weight[1], + "w3", num_experts) + success = success_w1 and success_w3 + else: + # down_proj + success = self.load_fused_expert_weights( + name_mapped, params_dict, loaded_weight, + shard_id, num_experts) + else: + # Skip loading extra parameters for GPTQ/modelopt models + if name_mapped.endswith( + ignore_suffixes + ) and name_mapped not in params_dict: + continue + param = params_dict[name_mapped] + # We should ask the weight loader to return success or + # not here since otherwise we may skip experts with + # other available replicas. 
+ weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith( + ignore_suffixes) and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + logger.warning_once( + "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501 + name, + remapped_kv_scale_name, + ) + continue + else: + name = remapped_kv_scale_name + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super(Qwen3MoeForCausalLM, self).__init__() + self.config = vllm_config.model_config.hf_config.text_config + self.quant_config = vllm_config.quant_config + self.model = Qwen3MoeLLMModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=self.quant_config) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(self.config.vocab_size) + self.make_empty_intermediate_tensors = ( + 
self.model.make_empty_intermediate_tensors) + + +@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, + info=Qwen3VLMoeProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder) +class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super(Qwen3VLForConditionalGeneration, self).__init__() + config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + + if not multimodal_config.get_limit_per_prompt("image") and \ + not multimodal_config.get_limit_per_prompt("video"): + self.visual = None + else: + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, + ) + + self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, + "language_model")) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + self.use_deepstack = hasattr(config.vision_config, + 'deepstack_visual_indexes') + self.deepstack_num_level = len( + config.vision_config.deepstack_visual_indexes + ) if self.use_deepstack else 0 + # register buffer for deepstack + if self.use_deepstack and self.visual is not None: + self.deepstack_input_embeds = [ + torch.zeros( + vllm_config.scheduler_config.max_num_batched_tokens, + config.text_config.hidden_size) + for _ in range(self.deepstack_num_level) + ] + else: + self.deepstack_input_embeds = None + self.visual_dim = config.vision_config.out_hidden_size + self.multiscale_dim = self.visual_dim * self.deepstack_num_level diff --git 
a/vllm_kunlun/models/seed_oss.py b/vllm_kunlun/models/seed_oss.py new file mode 100644 index 0000000..2ec32b3 --- /dev/null +++ b/vllm_kunlun/models/seed_oss.py @@ -0,0 +1,500 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Seed team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only SeedOss model compatible with HuggingFace weights.""" + +from collections.abc import Iterable +from itertools import islice + +import torch +from torch import nn +from transformers import PretrainedConfig as SeedOssConfig + +from vllm.attention import AttentionType +from vllm_kunlun.ops.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm_kunlun.ops.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm_kunlun.ops.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.sequence import IntermediateTensors + +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class SeedOssMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + 
self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. Only silu is supported for now." + ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class SeedOssAttention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + rope_scaling: tuple | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + self.head_dim = head_dim + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + attn_type=attn_type, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class SeedOssDecoderLayer(nn.Module): + def __init__( + self, + config: SeedOssConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 1000000) + rope_scaling = getattr(config, "rope_scaling", None) + + # By default, SeedOss uses causal attention as it is a + # decoder-only model. 
+ # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + + self.self_attn = SeedOssAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + head_dim=config.head_dim, + rope_theta=rope_theta, + cache_config=cache_config, + quant_config=quant_config, + rope_scaling=rope_scaling, + prefix=f"{prefix}.self_attn", + attn_type=attn_type, + ) + self.mlp = SeedOssMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class SeedOssModel(nn.Module): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + decoder_layer_type: type[nn.Module] = SeedOssDecoderLayer, + ): + 
super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + # TODO (@robertgshaw2): see if this can be moved out + if cache_config.sliding_window is not None and hasattr( + config, "max_window_layers" + ): + assert config.max_window_layers == config.num_hidden_layers, ( + "Sliding window for some but all layers is not supported. " + "This model uses sliding window but `max_window_layers` = {} " + "is less than `num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + ) + ) + + self.config = config + self.quant_config = quant_config + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank or ( + config.tie_word_embeddings and get_pp_group().is_last_rank + ): + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + # Use the provided decoder layer type or default to SeedDecoderLayer + decoder_layer_type = decoder_layer_type or SeedOssDecoderLayer + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: decoder_layer_type( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + 
inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for layer in islice(self.layers, self.start_layer, self.end_layer): + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = SeedOssModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, 
+ positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm_kunlun/ops/__init__.py b/vllm_kunlun/ops/__init__.py index fabfcaa..0412874 100644 --- a/vllm_kunlun/ops/__init__.py +++ b/vllm_kunlun/ops/__init__.py @@ -12,10 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# This file is a part of the vllm-kunlun project. +# This file is a part of the vllm-ascend project. # import vllm_kunlun.ops.rotary_embedding -import vllm_kunlun.ops.layernorm -import vllm_kunlun.ops.quantization.awq -import vllm_kunlun.ops.quantization.gptq +import vllm_kunlun.ops.layernorm \ No newline at end of file diff --git a/vllm_kunlun/ops/_kunlun_ops.py b/vllm_kunlun/ops/_kunlun_ops.py index 57515e1..013f75f 100644 --- a/vllm_kunlun/ops/_kunlun_ops.py +++ b/vllm_kunlun/ops/_kunlun_ops.py @@ -1,20 +1,3 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - """kunlun custom op entry""" import torch_xmlir import torch @@ -29,7 +12,6 @@ logger = init_logger(__name__) try: import xtorch_ops - logger.info(f"Load custom ops library success!") except ImportError as e: logger.warning("Import error msg: %s", e.msg) @@ -37,15 +19,13 @@ except ImportError as e: _per_token_smooth_quant = True - def is_per_token_smooth_quant(): - """is per token smooth quant""" + """ is per token smooth quant """ return _per_token_smooth_quant class KunlunOps: """KunlunOps""" - # Attention ops @staticmethod def paged_attention_v1( @@ -70,9 +50,10 @@ class KunlunOps: blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step, - alibi_sqrt=False, - ): - """PagedAttentionV1""" + alibi_sqrt=False + ): + """ PagedAttentionV1 """ + # block_size = value_cache.shape[2] xtorch_ops.paged_attention( x=query, k_cache=key_cache, @@ -83,7 +64,7 @@ class KunlunOps: is_context=is_context, is_causal=True, out=output, - vo_head_dim=128, + vo_head_dim=128 ) @staticmethod @@ -112,9 +93,10 @@ class KunlunOps: blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step, - alibi_sqrt=False, - ): - """PagedAttentionV2""" + alibi_sqrt=False + ): + """ PagedAttentionV2 """ + # block_size = value_cache.shape[2] xtorch_ops.paged_attention( x=query, k_cache=key_cache, @@ -125,28 +107,31 @@ class KunlunOps: is_context=is_context, is_causal=True, out=output, - vo_head_dim=128, + vo_head_dim=128 ) + # Activation ops @staticmethod - def silu_and_mul(out: torch.Tensor, x: torch.Tensor): - """silu and mul""" + def silu_and_mul(out: 
torch.Tensor, + x: torch.Tensor): + """ silu and mul """ xtorch_ops.silu_and_mul( x, axis=-1, turn=True, out=out, - ) + ) # Activation ops @staticmethod - def quick_gelu(out: torch.Tensor, x: torch.Tensor): - """quick gelu""" + def quick_gelu(out: torch.Tensor, + x: torch.Tensor): + """ quick gelu """ xtorch_ops.quick_gelu( x, out=out, - ) + ) # Layernorm @staticmethod @@ -157,7 +142,9 @@ class KunlunOps: epsilon, ): """rms_norm""" - xtorch_ops.rmsnorm(x, weight.to(torch.float32), epsilon, out=out) + xtorch_ops.rmsnorm( + x, weight.to(torch.float32), epsilon, out=out + ) @staticmethod def fused_add_rms_norm( @@ -175,11 +162,16 @@ class KunlunOps: residual.copy_(fused_input, non_blocking=True) x.copy_(output) + # Rotary embedding @staticmethod def rotary_embedding( - positions, query, key, head_size, cos_sin_cache, is_neox_style - ): + positions, + query, + key, + head_size, + cos_sin_cache, + is_neox_style): """ refactor RotaryEmbedding forward function """ @@ -196,43 +188,66 @@ class KunlunOps: key_x = key_x.unsqueeze(0) xtorch_ops.rotary_embedding_gptj( - positions, query_x, key_x, head_size, cos_sin_cache - ) + positions, + query_x, + key_x, + head_size, + cos_sin_cache) query.data = query_x - key.data = key_x + key.data = key_x if query_x_dim != query_x.dim(): query_x = query_x.unsqueeze(0) key_x = key_x.unsqueeze(0) return query, key - + # TODO: need opt if cos_sin_cache.dim() == 4: max_seq_len = cos_sin_cache.shape[2] head_dim = cos_sin_cache.shape[3] - cos_sin_cache = cos_sin_cache.squeeze(0).squeeze( - 0 - ) # Remove the first two dimensions [1,1,L,D] -> [L,D] + cos_sin_cache = cos_sin_cache.squeeze(0).squeeze(0) # 移除前两个维度 [1,1,L,D] -> [L,D] cos_sin_cache = cos_sin_cache.view(max_seq_len, 1, head_dim) - - # Reshape query and key + + # 重塑 query 和 key 的形状 num_tokens = query_x.shape[0] num_heads = query_x.shape[1] // head_size num_kv_heads = key_x.shape[1] // head_size + + # # [num_tokens, num_heads * head_size] -> [num_tokens, num_heads, head_size] + # 
query_x = query_x.view(num_tokens, num_heads, head_size) + # # [num_tokens, num_kv_heads * head_size] -> [num_tokens, num_kv_heads, head_size] + # key_x = key_x.view(num_tokens, num_kv_heads, head_size) + + # # 确保形状正确 + # assert query_x.shape == (num_tokens, num_heads, head_size), \ + # f"Expected query shape [{num_tokens}, {num_heads}, {head_size}], got {query_x.shape}" + # assert key_x.shape == (num_tokens, num_kv_heads, head_size), \ + # f"Expected key shape [{num_tokens}, {num_kv_heads}, {head_size}], got {key_x.shape}" torch.ops._C.rotary_embedding( - positions, query_x, key_x, head_size, cos_sin_cache, is_neox_style - ) + positions, + query_x, + key_x, + head_size, + cos_sin_cache, + is_neox_style) query_x = query_x.view(num_tokens, num_heads * head_size) key_x = key_x.view(num_tokens, num_kv_heads * head_size) + # query.data = query_x + # key.data = key_x return query_x, key_x # Rotary embedding @staticmethod def mrotary_embedding( - positions, mrope_section, query, key, head_size, cos_sin_cache, is_neox_style - ): + positions, + mrope_section, + query, + key, + head_size, + cos_sin_cache, + is_neox_style): """ refactor RotaryEmbedding forward function """ @@ -241,21 +256,35 @@ class KunlunOps: query_x_dim = query_x.dim() assert is_neox_style xtorch_ops.mrotary_embedding_neox( - positions, query_x, key_x, head_size, cos_sin_cache, mrope_section - ) + positions, + query_x, + key_x, + head_size, + cos_sin_cache, + mrope_section) query.data = query_x - key.data = key_x + key.data = key_x return query, key @staticmethod - def swap_blocks(src, dst, block_mapping): - """swap_blocks""" - xtorch_ops.swap_blocks(src, dst, block_mapping) + def swap_blocks( + src, + dst, + block_mapping): + """ swap_blocks """ + xtorch_ops.swap_blocks( + src, + dst, + block_mapping + ) @staticmethod - def copy_blocks(key_caches, value_caches, block_mapping): - """copy_blocks""" + def copy_blocks( + key_caches, + value_caches, + block_mapping): + """ copy_blocks """ for i in 
range(len(key_caches)): key_caches[i] = key_caches[i].contiguous() value_caches[i] = value_caches[i].contiguous() @@ -273,9 +302,16 @@ class KunlunOps: value_cache, slot_mapping, kv_cache_dtype, - ): - """reshape_and_cache""" - xtorch_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) + ): + """ reshape_and_cache """ + # slot_mapping_cast = slot_mapping.to(torch.int32) + xtorch_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping + ) @staticmethod def multi_query_kv_attention( @@ -284,7 +320,7 @@ class KunlunOps: query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - **kargs, + **kargs ) -> torch.Tensor: """ query: shape = [num_prompt_tokens, num_heads, head_size] @@ -303,7 +339,7 @@ class KunlunOps: KVh = key.size(2) if KVh != Qh: repeat = Qh // KVh - key = key.repeat_interleave(repeat, dim=2) # [B, T, Qh, Hd] + key = key.repeat_interleave(repeat, dim=2) # [B, T, Qh, Hd] value = value.repeat_interleave(repeat, dim=2) xtorch_ops.attention( q=query, @@ -318,132 +354,85 @@ class KunlunOps: return output @staticmethod - def quant_fusedresidual_rmsnorm_op( - x, residual, weight, bias, scale_to_int, eps, dyn_scale: bool, type: int = 1 - ): + def quant_fusedresidual_rmsnorm_op(x, + residual, + weight, + bias, + scale_to_int, + eps, + dyn_scale: bool, + type: int = 1): """Quantized fused residual layer normalization""" out = torch.empty_like(x, dtype=torch.int8) if is_per_token_smooth_quant(): - out_scale = torch.empty( - x.shape[:-1], device=x.device, dtype=torch.float - ).unsqueeze(-1) + out_scale = torch.empty(x.shape[:-1], device=x.device, dtype=torch.float).unsqueeze(-1) else: out_scale = torch.empty(12, device=x.device, dtype=torch.float) - xtorch_ops.quant_fusedresidual_rmsnorm( - x, - residual, - weight, - bias, - eps, - out=out, - out_scale=out_scale, - residual_tensor=residual, - ) + xtorch_ops.quant_fusedresidual_rmsnorm(x, residual, weight, bias, eps, + out=out, out_scale=out_scale , 
residual_tensor=residual) if residual is None: return out, out_scale return out, out_scale, residual @staticmethod - def quant_rmsnorm_op( - x, weight, bias, scale_to_int, eps, dyn_scale: bool, type: int = 1 - ): + def quant_rmsnorm_op(x, + weight, + bias, + scale_to_int, + eps, + dyn_scale : bool, + type: int = 1): """Quantized RMSNorm""" out = torch.empty_like(x, dtype=torch.int8) if is_per_token_smooth_quant(): - out_scale = torch.empty( - x.shape[:-1], device=x.device, dtype=torch.float - ).unsqueeze(-1) + out_scale = torch.empty(x.shape[:-1], device=x.device, dtype=torch.float).unsqueeze(-1) else: out_scale = torch.empty(12, device=x.device, dtype=torch.float) - xtorch_ops.quant_rmsnorm(x, weight, bias, eps, out=out, out_scale=out_scale) + xtorch_ops.quant_rmsnorm(x, weight, bias, eps, + out=out, out_scale=out_scale) return out, out_scale @staticmethod - def smooth_quant_matmul_column_row_kernels( - input_tensor, - weight, - smoother, - input_scale, - weight_scale, - perTokenScaling, - perChannelScaling, - otype, - ): + def smooth_quant_matmul_column_row_kernels(input_tensor, + weight, + smoother, + input_scale, + weight_scale, + perTokenScaling, + perChannelScaling, + otype): """smooth_quant_matmul_column_row_kernels""" input_shape = input_tensor.shape weight_shape = weight.shape if input_tensor.dim() == 3: input_tensor = input_tensor.reshape(-1, input_shape[-1]) - out = torch.empty( - (input_shape[0] * input_shape[1], weight_shape[0]), - dtype=torch.float16, - device=weight.device, - ) + out = torch.empty((input_shape[0] * input_shape[1], + weight_shape[0]), + dtype=torch.float16, + device=weight.device) output_bs_shape = [input_shape[0], input_shape[1]] elif input_tensor.dim() == 2: - out = torch.empty( - (input_shape[0], weight_shape[0]), - dtype=torch.float16, - device=weight.device, - ) + out = torch.empty((input_shape[0], weight_shape[0]), + dtype=torch.float16, + device=weight.device) output_bs_shape = [-1] - 
xtorch_ops.smooth_quant_matmul_column_row_kernels( - input_tensor, - weight, - smoother, - input_scale, - weight_scale, - perTokenScaling, - perChannelScaling, - out=out, - ) + xtorch_ops.smooth_quant_matmul_column_row_kernels(input_tensor, + weight, smoother, + input_scale, + weight_scale, + perTokenScaling, + perChannelScaling, + out=out) out = out.view(*output_bs_shape, weight_shape[0]) return out - @staticmethod - def fused_moe( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - gating_output: torch.Tensor, - linear_weights: torch.Tensor, - topk: int, - renormalize: bool, - inplace: bool = False, - use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, - topk_group: Optional[int] = None, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """fused_moe""" - output = torch.empty( - hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device - ) - expert_num = linear_weights.shape[0] - - torch.ops._C.moe_ffn_block( - x=hidden_states, - gate_w=linear_weights, - inter_w=w1, - output_w=w2, - expert_num=expert_num, - moe_top_k=topk, - topk_group=topk_group, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - expert_group_num=num_expert_group, - out=output, - ) - return output - @staticmethod def fused_moe_ep( hidden_states: torch.Tensor, @@ -460,23 +449,23 @@ class KunlunOps: topk_group: Optional[int] = None, w1_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> torch.Tensor: x = hidden_states - batch, hidden_size = x.shape + batch, hidden_size = x.shape num_local_experts, up_gate_size, _ = w13_weight.shape - router_logits = x.to(linear_weights.dtype) @ linear_weights.T - - topk_weights = torch.empty( - batch, top_k, dtype=router_logits.dtype, device=router_logits.device - ) - topk_ids = torch.empty( - batch, top_k, dtype=torch.int32, device=router_logits.device - ) - block_static = 
torch.empty(0, dtype=torch.int32, device=router_logits.device) - torch.ops._C.moe_softmax_topk( - router_logits, topk_weights, topk_ids, block_static - ) + router_logits = x.to(linear_weights.dtype)@linear_weights.T + + topk_weights = torch.empty(batch, + top_k, + dtype=router_logits.dtype, + device=router_logits.device) + topk_ids = torch.empty(batch, + top_k, + dtype=torch.int32, + device=router_logits.device) + block_static = torch.empty(0, dtype=torch.int32,device=router_logits.device) + torch.ops._C.moe_softmax_topk(router_logits, topk_weights, topk_ids, block_static) if renormalize: topk_weights = topk_weights / topk_weights.sum(1, keepdim=True) @@ -490,22 +479,50 @@ class KunlunOps: selected_token = topk_ids_flat == experts_id if selected_token.sum(): cur_token = repeat_x[selected_token] - up_gate = torch.empty( - selected_token.sum(), - up_gate_size // 2, - dtype=cur_token.dtype, - device=cur_token.device, - ) - torch.ops._C.swiglu(cur_token @ w13_weight[i].T, up_gate) + up_gate = torch.empty(selected_token.sum(), up_gate_size//2, + dtype=cur_token.dtype, device=cur_token.device) + torch.ops._C.swiglu(cur_token@ w13_weight[i].T, up_gate) out[selected_token] = up_gate @ w2_weight[i].T - output = ( - (out.view(batch, top_k, hidden_size) * topk_weights.unsqueeze(2)) - .sum(dim=1) - .to(x.dtype) - ) + output = (out.view(batch, top_k, hidden_size) * topk_weights.unsqueeze(2)).sum(dim=1).to(x.dtype) return output + @staticmethod + def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + linear_weights: torch.Tensor, + topk: int, + renormalize: bool, + inplace: bool = False, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """fused_moe""" + output = torch.empty(hidden_states.shape, dtype=hidden_states.dtype, + 
device=hidden_states.device) + expert_num = linear_weights.shape[0] + + torch.ops._C.moe_ffn_block( + x=hidden_states, + gate_w=linear_weights, + inter_w=w1, + output_w=w2, + expert_num=expert_num, + moe_top_k=topk, + topk_group=topk_group, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + expert_group_num=num_expert_group, + out=output, + ) + return output + @staticmethod def fused_multi_head_latent_page_attention( hidden_states: torch.Tensor, @@ -538,11 +555,10 @@ class KunlunOps: prompt_lods_cpu: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, - ) -> torch.Tensor: + ) -> torch.Tensor: """mla pa block""" - output = torch.empty( - hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device - ) + output = torch.empty(hidden_states.shape, dtype=hidden_states.dtype, + device=hidden_states.device) xtorch_ops.xft_multi_head_latent_page_attention_block( hidden_states, q_lora_rank, @@ -579,3 +595,42 @@ class KunlunOps: v_cache=v_cache, ) return output + + + def fused_gdn_gating( + A_log: torch.Tensor, + a: torch.Tensor, + dt_bias: torch.Tensor, + beta: float = 1.0, + threshold: float = 20.0, + ) -> torch.Tensor: + """fused_gdn_gating""" + output = xtorch_ops.fused_gdn_gating( + A_log, + a, + dt_bias, + ) + return output + + + def fused_recurrent_gated_delta_rule_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + h0_source: torch.Tensor, + output_final_state: bool, + use_qk_l2norm_in_kernel: bool, + cu_seqlens: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]: + ''' + Qwen3-NEXT模型中 Gated DeltaNet的核心算子, 将做完sigmoid_gating和delta_rule_update融合在一起 + 1. Sigmoid Gating: 对输入进行门控, 类似于 GLU (Gated Linear Unit)。 + 2. 
@CustomOp.register("kunlun_fatrelu_and_mul")
class FatreluAndMul(CustomOp):
    """Gated FATReLU activation.

    Computes ``FATReLU(x[..., :d]) * x[..., d:]`` with
    ``d = x.shape[-1] // 2``. Used by openbmb/MiniCPM-S-1B-sft.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, threshold: float = 0.):
        """Store the FATReLU cut-off.

        Args:
            threshold: Gate values that are not greater than this are
                replaced by 0. Defaults to 0.
        """
        super().__init__()
        self.threshold = threshold

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """Threshold the first half of the last dim and gate the second half.

        Args:
            x: Tensor whose last dimension is split at ``x.shape[-1] // 2``.

        Returns:
            ``F.threshold(first_half, threshold, 0.0) * second_half``.
        """
        half = x.shape[-1] // 2
        gate, value = x[..., :half], x[..., half:]
        return F.threshold(gate, self.threshold, 0.0) * value

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Delegate to :meth:`forward_native`; no dedicated kernel is used."""
        return self.forward_native(x)
def forward_neuron(self, x: torch.Tensor) -> torch.Tensor:
    """Compute ``silu(x[..., :d]) * x[..., d:]`` on a 2-D view of ``x``.

    ``d`` is ``x.shape[-1] // 2``. The input is viewed as
    ``(-1, x.shape[-1])``, the SiLU is spelled out as
    ``gate * sigmoid(gate)``, and the result is viewed back to the
    leading dimensions of ``x`` with a last dimension of ``d``.

    Args:
        x: Input tensor; must be viewable as ``(-1, x.shape[-1])``.

    Returns:
        Tensor of shape ``(*x.shape[:-1], d)``.
    """
    half = x.shape[-1] // 2
    flat = x.view(-1, x.shape[-1])
    gate = flat[:, :half]
    activated = gate * torch.sigmoid(gate)
    gated = activated * flat[:, half:]
    return gated.view(*x.shape[:-1], half)
@CustomOp.register("kunlun_gelu_and_mul")
class GeluAndMul(CustomOp):
    """An activation function for GeGLU.

    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    def __init__(self, approximate: str = "none"):
        """Store and validate the GELU approximation mode.

        Args:
            approximate: Either "none" or "tanh".

        Raises:
            ValueError: If ``approximate`` is any other value.
        """
        super().__init__()
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Compute GELU-and-mul, using the xtorch GELU kernel for exact mode.

        The previous version could not run: it passed a tensor as the size
        argument of ``torch.empty``, printed the input, never multiplied by
        the second half, and referenced an undefined ``ops`` in the tanh
        branch.

        Args:
            x: Input tensor whose last dimension is split in half.

        Returns:
            Tensor with the last dimension reduced to ``x.shape[-1] // 2``.
        """
        if self.approximate != "none":
            # No tanh kernel is wired up in this file; use the reference
            # implementation so the result is at least correct.
            return self.forward_native(x)

        import xtorch_ops

        d = x.shape[-1] // 2
        # The kernel is given a dense copy of the gate half and a matching
        # output buffer, mirroring the (input, out) call used before.
        # NOTE(review): assumes xtorch_ops.gelu is an element-wise exact
        # GELU writing into `out` - confirm against the xtorch_ops API.
        gate = x[..., :d].contiguous()
        out = torch.empty_like(gate)
        xtorch_ops.gelu(gate, out)
        out.mul_(x[..., d:])
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Compute GELU-and-mul through the IPEX ops.

        Args:
            x: Input tensor whose last dimension is split in half.

        Returns:
            Tensor with the last dimension reduced to ``x.shape[-1] // 2``,
            with the dtype and device of ``x``.
        """
        from vllm._ipex_ops import ipex_ops as ops

        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if self.approximate == "none":
            ops.gelu_and_mul(out, x)
        elif self.approximate == "tanh":
            ops.gelu_tanh_and_mul(out, x)
        return out

    def extra_repr(self) -> str:
        """Return the ``approximate=...`` summary shown in the module repr."""
        return f'approximate={repr(self.approximate)}'
@CustomOp.register("kunlun_gelu_fast")
class FastGELU(CustomOp):
    """Tanh-based fast GELU approximation."""

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        inner = x * 0.7978845608 * (1.0 + 0.044715 * x * x)
        return 0.5 * x * (1.0 + torch.tanh(inner))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``gelu_fast`` from ``vllm._custom_ops`` into a new buffer.

        Args:
            x: Input tensor.

        Returns:
            A tensor allocated with ``torch.empty_like(x)`` and filled by
            the kernel.
        """
        from vllm import _custom_ops as ops

        result = torch.empty_like(x)
        ops.gelu_fast(result, x)
        return result

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``ipex_ops.gelu_fast(x)``.

        Args:
            x: Input tensor.

        Returns:
            Whatever the IPEX ``gelu_fast`` op returns for ``x``.
        """
        from vllm._ipex_ops import ipex_ops as ops

        return ops.gelu_fast(x)
class ScaledActivation(nn.Module):
    """An activation function with post-scale parameters.

    This is used for some quantization methods like AWQ.
    """

    def __init__(
        self,
        act_module: nn.Module,
        intermediate_size: int,
        input_is_parallel: bool = True,
        params_dtype: Optional[torch.dtype] = None,
    ):
        """Wrap ``act_module`` and allocate the per-channel scales.

        Args:
            act_module: Activation applied before the division.
            intermediate_size: Full (unsharded) number of scale entries.
            input_is_parallel: When true, the scales hold only this rank's
                share, i.e. ``intermediate_size`` divided by the
                tensor-parallel world size.
            params_dtype: Dtype of the scales; the torch default dtype is
                used when None.
        """
        super().__init__()
        self.act = act_module
        self.input_is_parallel = input_is_parallel
        partition_size = (
            divide(intermediate_size, get_tensor_model_parallel_world_size())
            if input_is_parallel else intermediate_size)
        dtype = (torch.get_default_dtype()
                 if params_dtype is None else params_dtype)
        self.scales = nn.Parameter(torch.empty(partition_size, dtype=dtype))
        # Register the custom loader so checkpoint loading shards the scales.
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``act(x) / scales``."""
        return self.act(x) / self.scales

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        """Copy ``loaded_weight`` into ``param``, sharding it when parallel.

        With ``input_is_parallel`` set, only the slice belonging to this
        tensor-parallel rank (along dim 0) is copied.

        Args:
            param: Destination parameter, modified in place.
            loaded_weight: Checkpoint tensor to load from.
        """
        target = param.data
        if self.input_is_parallel:
            shard_size = target.shape[0]
            offset = get_tensor_model_parallel_rank() * shard_size
            loaded_weight = loaded_weight.narrow(0, offset, shard_size)
        assert target.shape == loaded_weight.shape
        target.copy_(loaded_weight)
def get_act_and_mul_fn(act_fn_name: str) -> nn.Module:
    """Get an activation-and-mul (i.e. SiluAndMul) function by name.

    The lookup is case-insensitive: the name is lower-cased before it is
    checked against ``_ACTIVATION_AND_MUL_REGISTRY``.

    Raises:
        ValueError: If the lower-cased name is not in the registry.
    """
    key = act_fn_name.lower()
    if key in _ACTIVATION_AND_MUL_REGISTRY:
        return _ACTIVATION_AND_MUL_REGISTRY[key]
    raise ValueError(f"Activation function {key!r} is not supported.")
-# See the License for the specific language governing permissions and -# limitations under the License. """kunlun attention wrapper for context and decode""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type, TYPE_CHECKING import torch - if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUBuilder from itertools import accumulate -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionMetadata, - AttentionType, -) -from .utils import CommonAttentionState, CommonMetadataBuilder -from vllm.attention.backends.utils import ( - is_block_tables_empty, - compute_slot_mapping_start_idx, - compute_slot_mapping, -) -from vllm_kunlun.ops.paged_attn import PagedAttention, PagedAttentionMetadata +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from .utils import (CommonAttentionState, CommonMetadataBuilder) +from vllm.attention.backends.utils import (is_block_tables_empty, + compute_slot_mapping_start_idx, compute_slot_mapping) +from vllm_kunlun.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) from vllm_kunlun.ops._kunlun_ops import KunlunOps from vllm.attention.backends.abstract import AttentionLayer from vllm.logger import init_logger from vllm.utils import async_tensor_h2d - logger = init_logger(__name__) class KunlunAttentionBackend(AttentionBackend): """KunlunAttentionBackend""" - accept_output_buffer = False - @staticmethod def get_name() -> str: return "KUNLUN_ATTENTION" @@ -80,9 +53,8 @@ class KunlunAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape( - num_blocks, block_size, num_kv_heads, head_size - ) + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -182,6 +154,7 @@ class KunlunMetadata(AttentionMetadata, 
PagedAttentionMetadata): seq_lens_tensor_cpu: Optional[torch.Tensor] = None + def __post_init__(self): # Set during the execution of the first attention op. # It is a list because it is needed to set per prompt @@ -194,27 +167,23 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): @property def is_all_encoder_attn_metadata_set(self): - """ + ''' All attention metadata required for encoder attention is set. - """ - return ( - (self.encoder_seq_lens is not None) - and (self.encoder_seq_lens_tensor is not None) - and (self.max_encoder_seq_len is not None) - ) + ''' + return ((self.encoder_seq_lens is not None) + and (self.encoder_seq_lens_tensor is not None) + and (self.max_encoder_seq_len is not None)) @property def is_all_cross_attn_metadata_set(self): - """ + ''' All attention metadata required for enc/dec cross-attention is set. Superset of encoder attention required metadata. - """ - return ( - self.is_all_encoder_attn_metadata_set - and (self.cross_slot_mapping is not None) - and (self.cross_block_tables is not None) - ) + ''' + return (self.is_all_encoder_attn_metadata_set + and (self.cross_slot_mapping is not None) + and (self.cross_block_tables is not None)) @property def prefill_metadata(self) -> Optional["KunlunMetadata"]: @@ -227,60 +196,43 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): # metadata structure return self._cached_prefill_metadata - assert (self.seq_lens is not None) or (self.encoder_seq_lens is not None) - assert (self.seq_lens_tensor is not None) or ( - self.encoder_seq_lens_tensor is not None - ) + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) # Compute some attn_metadata fields which default to None - query_start_loc = ( - None - if self.query_start_loc is None - else self.query_start_loc[: self.num_prefills + 1] - ) + query_start_loc = (None if self.query_start_loc is None else 
+ self.query_start_loc[:self.num_prefills + 1]) # flash attention needs both lod information on host and device - query_start_loc_host = ( - None - if self.query_start_loc_host is None - else self.query_start_loc_host[: self.num_prefills + 1] - ) - kv_prefix_start_loc_host = ( - None - if self.kv_prefix_start_loc_host is None - else self.kv_prefix_start_loc_host[: self.num_prefills + 1] - + query_start_loc_host - ) - kv_prefix_start_loc = ( - None - if kv_prefix_start_loc_host is None - else kv_prefix_start_loc_host.cuda() - ) - slot_mapping = ( - None - if self.slot_mapping is None - else self.slot_mapping[: self.num_prefill_tokens] - ) - seq_lens = None if self.seq_lens is None else self.seq_lens[: self.num_prefills] - seq_lens_tensor = ( - None - if self.seq_lens_tensor is None - else self.seq_lens_tensor[: self.num_prefills] - ) - context_lens_tensor = ( - None - if self.context_lens_tensor is None - else self.context_lens_tensor[: self.num_prefills] - ) + query_start_loc_host = (None if self.query_start_loc_host is None else + self.query_start_loc_host[:self.num_prefills + 1]) + kv_prefix_start_loc_host = (None if self.kv_prefix_start_loc_host is None else + self.kv_prefix_start_loc_host[:self.num_prefills + 1] + query_start_loc_host) + kv_prefix_start_loc = (None if kv_prefix_start_loc_host is None else kv_prefix_start_loc_host.cuda()) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + # for prefix cache, block table only contains blocks that hit + # if self.block_tables is None: + # block_tables = None + # elif self.block_tables.shape[1] == 0: + # block_tables = 
self.block_tables[:self.num_prefills] + # else: + # block_tables = self.block_tables[:self.num_prefills][:, -1].clone() - block_tables = ( - None - if self.block_tables is None - else self.block_tables[: self.num_prefills] - ) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) # Construct & cache prefill-phase attention metadata structure self._cached_prefill_metadata = KunlunMetadata( - multi_modal_placeholder_index_maps=self.multi_modal_placeholder_index_maps, + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, num_prefills=self.num_prefills, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, @@ -305,8 +257,7 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): cross_slot_mapping=self.cross_slot_mapping, cross_block_tables=self.cross_block_tables, enable_kv_scales_calculation=False, - seq_start_loc=self.seq_start_loc, - ) + seq_start_loc=self.seq_start_loc) return self._cached_prefill_metadata @property @@ -319,35 +270,25 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): # Recover cached decode-phase attention # metadata structure return self._cached_decode_metadata - assert (self.seq_lens_tensor is not None) or ( - self.encoder_seq_lens_tensor is not None - ) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) # Compute some attn_metadata fields which default to None - slot_mapping = ( - None - if self.slot_mapping is None - else self.slot_mapping[self.num_prefill_tokens :] - ) - seq_lens_tensor = ( - None - if self.seq_lens_tensor is None - else self.seq_lens_tensor[self.num_prefills :] - ) - seq_lens_tensor_cpu = ( - None - if self.seq_lens_tensor_cpu is None - else self.seq_lens_tensor_cpu[self.num_prefills :] - ) - block_tables = ( - None - if self.block_tables is None - else self.block_tables[self.num_prefills :] - ) + slot_mapping = (None if self.slot_mapping is None else + 
self.slot_mapping[self.num_prefill_tokens:]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[self.num_prefills:]) + seq_lens_tensor_cpu = (None if self.seq_lens_tensor_cpu is None else + self.seq_lens_tensor_cpu[self.num_prefills:]) + block_tables = (None if self.block_tables is None else + self.block_tables[self.num_prefills:]) + + # Construct & cache decode-phase attention metadata structure self._cached_decode_metadata = KunlunMetadata( - multi_modal_placeholder_index_maps=self.multi_modal_placeholder_index_maps, + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, num_prefills=0, num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, @@ -364,16 +305,13 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): max_encoder_seq_len=self.max_encoder_seq_len, cross_slot_mapping=self.cross_slot_mapping, cross_block_tables=self.cross_block_tables, - enable_kv_scales_calculation=False, - ) + enable_kv_scales_calculation=False) return self._cached_decode_metadata class KunlunMetadataBuilder(CommonMetadataBuilder[KunlunMetadata]): """KunlunMetadataBuilder""" - _metadata_cls = KunlunMetadata - def __init__(self, input_builder: "ModelInputForGPUBuilder"): super().__init__(input_builder) self.prefix_cache_kv_lens: List[int] = [] @@ -382,120 +320,90 @@ class KunlunMetadataBuilder(CommonMetadataBuilder[KunlunMetadata]): """prepare""" super().prepare() self.prefix_cache_kv_lens = list() - def _add_seq_group( - self, - inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, - ): + self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", + chunked_prefill_enabled: bool): is_prompt = inter_data.is_prompt block_tables = inter_data.block_tables - for ( - seq_id, - token_len, - seq_len, - curr_seq_len, - query_len, - context_len, - curr_sliding_window_block, - ) in zip( - inter_data.seq_ids, - [len(t) for t in inter_data.input_tokens], - 
inter_data.orig_seq_lens, - inter_data.seq_lens, - inter_data.query_lens, - inter_data.context_lens, - inter_data.curr_sliding_window_blocks, - ): + for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, + curr_sliding_window_block) in zip( + inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], + inter_data.orig_seq_lens, inter_data.seq_lens, + inter_data.query_lens, inter_data.context_lens, + inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: mm_maps = inter_data.multi_modal_placeholder_maps if mm_maps: for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend(placeholders) + self.multimodal_placeholder_maps[modality].extend( + placeholders) self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) else: - assert ( - query_len == 1 - ), "seq_len: {}, context_len: {}, query_len: {}".format( - seq_len, context_len, query_len - ) + assert query_len == 1, ( + "seq_len: {}, context_len: {}, query_len: {}".format( + seq_len, context_len, query_len)) self.num_decode_tokens += query_len self.curr_seq_lens.append(curr_seq_len) # Compute block table. 
def build(self, seq_lens: List[int], query_lens: List[int],
          cuda_graph_pad_size: int, batch_size: int):
    """Build the metadata via the parent builder and add host-side fields.

    On top of what ``super().build`` returns, this sets:

    * ``query_start_loc_host``: int32 CPU tensor with the running sum of
      ``query_lens``, starting at 0.
    * ``max_kv_len``: the maximum over the recorded prefix-cache lengths
      and ``seq_lens`` of the built metadata.
    * ``kv_prefix_start_loc_host``: only when at least one recorded
      prefix-cache length is non-zero; int32 CPU tensor with their running
      sum. ``self.prefix_cache_kv_lens`` is replaced by that running sum
      as a side effect.
    * ``seq_lens_tensor_cpu``: CPU copy of ``seq_lens_tensor``.
    """
    metadata = super().build(seq_lens, query_lens, cuda_graph_pad_size,
                             batch_size)
    metadata.query_start_loc_host = torch.tensor(
        list(accumulate(query_lens, initial=0)),
        dtype=torch.int32,
        device='cpu')

    metadata.max_kv_len = max(self.prefix_cache_kv_lens + metadata.seq_lens)

    # Prefix-cache lengths were recorded and at least one of them is a hit.
    has_prefix_hit = (len(self.prefix_cache_kv_lens) != 0
                      and max(self.prefix_cache_kv_lens) != 0)
    if has_prefix_hit:
        self.prefix_cache_kv_lens = list(
            accumulate(self.prefix_cache_kv_lens, initial=0))
        metadata.kv_prefix_start_loc_host = torch.tensor(
            self.prefix_cache_kv_lens, dtype=torch.int32, device="cpu")

    metadata.seq_lens_tensor_cpu = metadata.seq_lens_tensor.to("cpu")
    return metadata
@@ -517,7 +425,7 @@ def _get_seq_len_block_table_args( * Appropriate sequence-lengths tensor * Appropriate max sequence-length scalar * Appropriate block tables (or None) - """ + ''' if attn_type == AttentionType.DECODER: # Decoder self-attention @@ -526,26 +434,23 @@ def _get_seq_len_block_table_args( max_seq_len = attn_metadata.max_prefill_seq_len else: max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_lens_tensor, max_seq_len, attn_metadata.block_tables) + return (attn_metadata.seq_lens_tensor, max_seq_len, + attn_metadata.block_tables) elif attn_type == AttentionType.ENCODER_DECODER: # Enc/dec cross-attention KVs match encoder sequence length; # cross-attention utilizes special "cross" block tables - return ( - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.cross_block_tables, - ) + return (attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, + attn_metadata.cross_block_tables) elif attn_type == AttentionType.ENCODER: # No block tables associated with encoder attention - return ( - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - None, - ) + return (attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, None) else: raise AttributeError(f"Invalid attention type {str(attn_type)}") + class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): """KunlunAttentionImpl""" @@ -564,7 +469,8 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): kv_sharing_target_layer_name: Optional[str] = None, ) -> None: if blocksparse_params is not None: - raise ValueError("kunlunAttention does not support block-sparse attention.") + raise ValueError( + "kunlunAttention does not support block-sparse attention.") # if logits_soft_cap is not None: # raise ValueError( # "kunlunAttention does not support attention logits soft capping.") @@ -585,8 +491,8 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): if head_size not in 
suppored_head_sizes: raise ValueError( f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {suppored_head_sizes}." - ) + f"Supported head sizes are: {suppored_head_sizes}.") + def forward( self, @@ -654,21 +560,16 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): # Check that appropriate attention metadata attributes are # selected for the desired attention type - if attn_type == AttentionType.ENCODER and ( - not attn_metadata.is_all_encoder_attn_metadata_set - ): - raise AttributeError( - "Encoder attention requires setting " "encoder metadata attributes." - ) + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") - elif attn_type == AttentionType.ENCODER_DECODER and ( - not attn_metadata.is_all_cross_attn_metadata_set - ): - raise AttributeError( - "Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes." - ) + elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") query = query.view(-1, self.num_heads, self.head_size) if key is not None: @@ -682,7 +583,7 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): # which KV cache memory-mapping & which # seqlen datastructures we utilize - if attn_type != AttentionType.ENCODER and kv_cache.numel() > 0: + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): # KV-cache during decoder-self- or # encoder-decoder-cross-attention, but not # during encoder attention. @@ -691,8 +592,7 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): # we still need to break out key_cache and value_cache # i.e. 
for later use by paged attention key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size - ) + kv_cache, self.num_kv_heads, self.head_size) if (key is not None) and (value is not None): @@ -701,14 +601,10 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): else: updated_slot_mapping = attn_metadata.slot_mapping value = value.contiguous() - KunlunOps.reshape_and_cache( - key, - value, - key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - ) + KunlunOps.reshape_and_cache(key, value, key_cache, + value_cache, + updated_slot_mapping, + self.kv_cache_dtype) if attn_type == AttentionType.ENCODER: # Encoder attention - chunked prefill is not applicable; @@ -753,20 +649,14 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): # Prompt run. if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: out = KunlunOps.multi_query_kv_attention( - prefill_meta.query_start_loc, - prefill_meta.query_start_loc_host, - query, - key, - value, - alibi_slopes=self.alibi_slopes, - ).view_as(query) + prefill_meta.query_start_loc,prefill_meta.query_start_loc_host, query, key, value, + alibi_slopes=self.alibi_slopes).view_as(query) assert output[:num_prefill_tokens].shape == out.shape output[:num_prefill_tokens] = out if decode_meta := attn_metadata.decode_metadata: - assert ( - attn_type != AttentionType.ENCODER_ONLY - ), "Encoder-only models should not have decode metadata." + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") ( seq_lens_arg, max_seq_len_arg, @@ -791,4 +681,4 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): ) # Reshape the output tensor. 
- return output.view(-1, self.num_heads * self.head_size) + return output.view(-1, self.num_heads * self.head_size) \ No newline at end of file diff --git a/vllm_kunlun/ops/attention/layer.py b/vllm_kunlun/ops/attention/layer.py index f5c60fd..3f5bf03 100644 --- a/vllm_kunlun/ops/attention/layer.py +++ b/vllm_kunlun/ops/attention/layer.py @@ -4,13 +4,12 @@ import torch import torch.nn.functional as F from typing import Optional, List, Dict, Any from vllm.attention import AttentionType -from vllm.distributed.kv_transfer import ( - get_kv_transfer_group, - has_kv_transfer_group, - is_v1_kv_transfer_group, -) +from vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group, + is_v1_kv_transfer_group) from vllm.config import CacheConfig -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.forward_context import ForwardContext, get_forward_context @@ -20,10 +19,8 @@ from torch.library import custom_op, impl from vllm.platforms import _Backend - class Attention(VllmAttention): """Attention""" - def __init__( self, num_heads: int, @@ -75,8 +72,11 @@ class Attention(VllmAttention): if attn_metadata.enable_kv_scales_calculation: self.calc_kv_scales(query, key, value) if self.use_output: - output_shape = output_shape if output_shape is not None else query.shape - output = torch.zeros(output_shape, dtype=query.dtype, device=query.device) + output_shape = (output_shape + if output_shape is not None else query.shape) + output = torch.zeros(output_shape, + dtype=query.dtype, + device=query.device) hidden_size = output_shape[-1] # We skip reshaping query, key and value tensors for the MLA # backend since these tensors have different semantics and are @@ -97,13 +97,16 @@ class Attention(VllmAttention): if isinstance(attn_metadata, dict): attn_metadata = attn_metadata[self.layer_name] self_kv_cache = 
self.kv_cache[forward_context.virtual_engine] - self.impl.forward( - self, query, key, value, self_kv_cache, attn_metadata, output=output - ) + self.impl.forward(self, + query, + key, + value, + self_kv_cache, + attn_metadata, + output=output) else: torch.ops.vllm.unified_attention_with_output_kunlun( - query, key, value, output, self.layer_name - ) + query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: if self.use_direct_call: @@ -112,15 +115,13 @@ class Attention(VllmAttention): if isinstance(attn_metadata, dict): attn_metadata = attn_metadata[self.layer_name] self_kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward( - self, query, key, value, self_kv_cache, attn_metadata - ) + return self.impl.forward(self, query, key, value, + self_kv_cache, attn_metadata) else: - return unified_attention(query, key, value, self.layer_name) + return unified_attention( + query, key, value, self.layer_name) - -# -# Rewritten from the MultiHeadAttention class in vllm.attention.layer +# 重写自 vllm.attention.layer 中的 MultiHeadAttention 类 class MultiHeadAttention(VllmMultiHeadAttention): def __init__( self, @@ -130,15 +131,14 @@ class MultiHeadAttention(VllmMultiHeadAttention): num_kv_heads: Optional[int] = None, ): super().__init__( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=num_kv_heads, + num_heads = num_heads, + head_size = head_size, + scale = scale, + num_kv_heads = num_kv_heads, ) - - # kunlun only supports flash_attn + # kunlun只支持flash_attn self.attn_backend = _Backend.FLASH_ATTN - + def forward( self, query: torch.Tensor, @@ -159,31 +159,34 @@ class MultiHeadAttention(VllmMultiHeadAttention): key = torch.repeat_interleave(key, num_repeat, dim=2) value = torch.repeat_interleave(value, num_repeat, dim=2) - # kunlun only supports flash_attn + # kunlun只支持flash_attn if self.attn_backend == _Backend.FLASH_ATTN: from flash_attn import flash_attn_func - out = flash_attn_func(query, key, 
value, softmax_scale=self.scale) elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops - out = xops.memory_efficient_attention_forward( - query, key, value, scale=self.scale - ) + out = xops.memory_efficient_attention_forward(query, + key, + value, + scale=self.scale) elif self.attn_backend == _Backend.TORCH_SDPA: - query, key, value = (x.transpose(1, 2) for x in (query, key, value)) - out = F.scaled_dot_product_attention(query, key, value, scale=self.scale) + query, key, value = (x.transpose(1, 2) + for x in (query, key, value)) + out = F.scaled_dot_product_attention(query, + key, + value, + scale=self.scale) out = out.transpose(1, 2) elif self.attn_backend == _Backend.PALLAS_VLLM_V1: - query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + query, key, value = (x.transpose(1, 2) + for x in (query, key, value)) from torch_xla.experimental.custom_kernel import flash_attention - out = flash_attention(query, key, value, sm_scale=self.scale) out = out.transpose(1, 2) return out.reshape(bsz, q_len, -1) - def wait_for_kv_layer_from_connector(layer_name: str): """wait_for_kv_layer_from_connector""" if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): @@ -198,10 +201,9 @@ def wait_for_kv_layer_from_connector(layer_name: str): assert isinstance(attn_metadata, dict) connector.wait_for_layer_load(layer_name) - def maybe_save_kv_layer_to_connector( - layer_name: str, kv_cache_layer: List[torch.Tensor] -): + layer_name: str, + kv_cache_layer: List[torch.Tensor]): """maybe_save_kv_layer_to_connector""" if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): return @@ -213,8 +215,8 @@ def maybe_save_kv_layer_to_connector( if attn_metadata is None: return assert isinstance(attn_metadata, dict) - connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata[layer_name]) - + connector.save_kv_layer(layer_name, kv_cache_layer, + attn_metadata[layer_name]) @custom_op("vllm::unified_attention_with_output_kunlun", 
mutates_args=()) def unified_attention_with_output_kunlun( @@ -223,8 +225,7 @@ def unified_attention_with_output_kunlun( value: torch.Tensor, output: torch.Tensor, layer_name: str, - output_scale: Optional[torch.Tensor] = None, -) -> None: + output_scale: Optional[torch.Tensor] = None,) -> None: wait_for_kv_layer_from_connector(layer_name) forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata @@ -232,26 +233,26 @@ def unified_attention_with_output_kunlun( attn_metadata = attn_metadata[layer_name] self = forward_context.no_compile_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - self.impl.forward(self, query, key, value, kv_cache, attn_metadata, output=output) + self.impl.forward(self, + query, + key, + value, + kv_cache, + attn_metadata, + output=output) maybe_save_kv_layer_to_connector(layer_name, kv_cache) - def _fake_unified_attention_with_output_kunlun( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, output: torch.Tensor, layer_name: str, - output_scale: Optional[torch.Tensor] = None, -) -> None: + output_scale: Optional[torch.Tensor] = None,) -> None: return None - -unified_attention_with_output_kunlun.register_fake( - _fake_unified_attention_with_output_kunlun -) - +unified_attention_with_output_kunlun.register_fake(_fake_unified_attention_with_output_kunlun) def unified_attention( query: torch.Tensor, @@ -268,7 +269,8 @@ def unified_attention( attn_metadata = attn_metadata[layer_name] self = forward_context.no_compile_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - output = self.impl.forward(self, query, key, value, kv_cache, attn_metadata) + output = self.impl.forward(self, query, key, value, kv_cache, + attn_metadata) maybe_save_kv_layer_to_connector(layer_name, kv_cache) - return output + return output \ No newline at end of file diff --git a/vllm_kunlun/ops/fla/__init__.py b/vllm_kunlun/ops/fla/__init__.py new file mode 100644 index 
0000000..76cd910 --- /dev/null +++ b/vllm_kunlun/ops/fla/__init__.py @@ -0,0 +1,9 @@ +from .chunk import chunk_gated_delta_rule +from .fused_recurrent import fused_recurrent_gated_delta_rule +from .layernorm_guard import RMSNormGated +from .torch_fla import l2norm, torch_chunk_gated_delta_rule +__all__ = [ + "RMSNormGated", + "chunk_gated_delta_rule", + "fused_recurrent_gated_delta_rule", +] \ No newline at end of file diff --git a/vllm_kunlun/ops/fla/chunk.py b/vllm_kunlun/ops/fla/chunk.py new file mode 100644 index 0000000..01e074d --- /dev/null +++ b/vllm_kunlun/ops/fla/chunk.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import warnings +from typing import Optional +import torch.nn.functional as F + +import torch +import torch.distributed as dist +from einops import rearrange + +from .chunk_delta_h import chunk_gated_delta_rule_fwd_h +from .chunk_o import chunk_fwd_o +from .chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd +from .cumsum import chunk_local_cumsum +from .l2norm import l2norm_fwd +from .solve_tril import solve_tril +from .utils import SUPPRESS_LEVEL, input_guard +from .wy_fast import recompute_w_u_fwd + + +def torch_solve_tril(A: torch.Tensor, cu_seqlens: Optional[torch.LongTensor] = None, output_dtype: torch.dtype = torch.float,): + chunk_size=64 + A = A.transpose(1,2) + sequence_length = A.shape[-2] + pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size + A = F.pad(A, (0, 0, 0, pad_size)) + A = A.reshape(A.shape[0], A.shape[1], -1, chunk_size, A.shape[-1]) + mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, 
device=A.device), diagonal=0) + + A = A.masked_fill(mask, 0) + for i in range(1, chunk_size): + row = A[..., i, :i].clone() + sub = A[..., :i, :i].clone() + A[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) + A = A + torch.eye(chunk_size, dtype=A.dtype, device=A.device) + return A.reshape(A.shape[0], A.shape[1], -1, A.shape[-1])[:,:,:sequence_length,:].transpose(1,2) + +def chunk_gated_delta_rule_fwd(q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: Optional[torch.LongTensor] = None): + g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens) + A = chunk_scaled_dot_kkt_fwd(k=k, + beta=beta, + g_cumsum=g, + cu_seqlens=cu_seqlens, + output_dtype=q.dtype) + + #torch版 + for i in range(len(cu_seqlens)-1): + A_i = A[:, cu_seqlens[i]:cu_seqlens[i+1], :, :] + A[:, cu_seqlens[i]:cu_seqlens[i+1], :, :] = torch_solve_tril(A=A_i, cu_seqlens=torch.tensor([0, cu_seqlens[i+1]-cu_seqlens[i]], device=q.device), output_dtype=k.dtype) + w, u = recompute_w_u_fwd( + k=k, + v=v, + beta=beta, + A=A, + g_cumsum=g, + cu_seqlens=cu_seqlens, + ) + h, v_new, final_state = chunk_gated_delta_rule_fwd_h( + k=k, + w=w, + u=u, + g=g, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + o = chunk_fwd_o( + q=q, + k=k, + v=v_new, + h=h, + g=g, + scale=scale, + cu_seqlens=cu_seqlens, + ) + if SUPPRESS_LEVEL < 3: + return g, o, A, final_state, None, None, None + elif SUPPRESS_LEVEL >= 3: + return g, o, A, final_state, w, h, v_new + + +class ChunkGatedDeltaRuleFunction(torch.autograd.Function): + + @staticmethod + @input_guard + @torch.amp.custom_fwd(device_type='cuda') + def forward(ctx, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: Optional[torch.LongTensor] = None, + 
use_qk_l2norm_in_kernel: bool = False): + if use_qk_l2norm_in_kernel: + q = l2norm_fwd(q) + k = l2norm_fwd(k) + + g, o, A, final_state, w, h, v_new = chunk_gated_delta_rule_fwd( + q=q, + k=k, + v=v, + g=g, + beta=beta, + scale=scale, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + ctx.scale = scale + ctx.use_qk_l2norm_in_kernel = use_qk_l2norm_in_kernel + return o.to(q.dtype), final_state + + +@torch.compiler.disable +def chunk_gated_delta_rule(q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, + use_qk_l2norm_in_kernel: bool = False): + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + k (torch.Tensor): + keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + v (torch.Tensor): + values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + g (torch.Tensor): + (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`. + beta (torch.Tensor): + betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`. + scale (Optional[int]): + Scale factor for the RetNet attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[N, H, K, V]` for `N` input sequences. + For equal-length input sequences, `N` equals the batch size `B`. + Default: `None`. + output_final_state (Optional[bool]): + Whether to output the final state of shape `[N, H, K, V]`. Default: `False`. + cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. 
+ head_first (Optional[bool]): + Whether the inputs are in the head-first format, which is not supported for variable-length inputs. + Default: `False`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + final_state (torch.Tensor): + Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`. + + Examples:: + >>> import torch + >>> import torch.nn.functional as F + >>> from einops import rearrange + >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule + # inputs with equal lengths + >>> B, T, H, K, V = 4, 2048, 4, 512, 512 + >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda') + >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1) + >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda') + >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid() + >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda')) + >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda') + >>> o, ht = chunk_gated_delta_rule( + q, k, v, g, beta, + initial_state=h0, + output_final_state=True + ) + # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required + >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g)) + # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected + >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long) + >>> o_var, ht_var = chunk_gated_delta_rule( + q, k, v, g, beta, + initial_state=h0, + output_final_state=True, + cu_seqlens=cu_seqlens + ) + """ + assert q.dtype == k.dtype == v.dtype + assert q.dtype != torch.float32, "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16." + assert len( + beta.shape + ) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise." 
+ + if head_first: + raise DeprecationWarning( + "head_first is deprecated and will be removed in a future version. " + "Please use head_first=False for now instead.", + stacklevel=2) + q, k, v, beta, g = map( + lambda x: rearrange(x, 'b h t ... -> b t h ...'), + (q, k, v, beta, g)) + if not head_first and q.shape[1] < q.shape[2]: + warnings.warn( + f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). " + "This may indicate the inputs were passed in head-first format [B, H, T, ...] " + "when head_first=False was specified. " + "Please verify your input tensor format matches the expected shape [B, T, H, ...].", + stacklevel=2) + if cu_seqlens is not None: + if q.shape[0] != 1: + raise ValueError( + f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`." + f"Please flatten variable-length inputs before processing.") + if initial_state is not None and initial_state.shape[0] != len( + cu_seqlens) - 1: + raise ValueError( + f"The number of initial states is expected to be equal to the number of input sequences, " + f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}." + ) + if scale is None: + scale = k.shape[-1]**-0.5 + o, final_state = ChunkGatedDeltaRuleFunction.apply( + q, k, v, g, beta, scale, initial_state, output_final_state, cu_seqlens, + use_qk_l2norm_in_kernel) + if head_first: + o = rearrange(o, 'b t h ... -> b h t ...') + return o, final_state diff --git a/vllm_kunlun/ops/fla/chunk_delta_h.py b/vllm_kunlun/ops/fla/chunk_delta_h.py new file mode 100644 index 0000000..ece792b --- /dev/null +++ b/vllm_kunlun/ops/fla/chunk_delta_h.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices, prepare_chunk_offsets +from .op import exp +from .utils import is_nvidia_hopper, use_cuda_graph + +NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_INITIAL_STATE": lambda args: args["h0"] is not None, + "STORE_FINAL_STATE": lambda args: args["ht"] is not None, + "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gated_delta_rule_fwd_kernel_h_blockdim64( + k, + v, + w, + v_new, + g, + h, + h0, + ht, + cu_seqlens, + chunk_offsets, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + SAVE_NEW_VALUE: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_h1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_h2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_h3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_h4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + h += (boh * H + i_h) * K * V + v += (bos * H + i_h) * V + k += (bos * Hg + i_h // (H // Hg)) * K + w += (bos * H + i_h) * K + 
if SAVE_NEW_VALUE: + v_new += (bos * H + i_h) * V + stride_v = H * V + stride_h = H * K * V + stride_k = Hg * K + stride_w = H * K + if USE_INITIAL_STATE: + h0 = h0 + i_nh * K * V + if STORE_FINAL_STATE: + ht = ht + i_nh * K * V + + # load initial state + if USE_INITIAL_STATE: + p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32) + if K > 64: + p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32) + if K > 128: + p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32) + if K > 192: + p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32) + + # main recurrence + for i_t in range(NT): + p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1)) + + p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_v_new = ( + tl.make_block_ptr(v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + if SAVE_NEW_VALUE + else None + ) + b_v_new = tl.zeros([BT, BV], dtype=tl.float32) + p_w = tl.make_block_ptr(w, (T, K), 
(stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype)) + if K > 64: + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype)) + if K > 128: + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype)) + if K > 192: + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype)) + + b_v_new = -b_v_new + tl.load(p_v, boundary_check=(0, 1)) + + if SAVE_NEW_VALUE: + p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + tl.store(p_v_new, b_v_new.to(p_v_new.dtype.element_ty), boundary_check=(0, 1)) + + if USE_G: + m_t = (i_t * BT + tl.arange(0, BT)) < T + last_idx = min((i_t + 1) * BT, T) - 1 + b_g_last = tl.load(g + bos * H + last_idx * H + i_h) + p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + b_v_new = b_v_new * tl.where(m_t, tl.exp(b_g_last - b_g), 0)[:, None] + b_g_last = tl.exp(b_g_last) + b_h1 = b_h1 * b_g_last + if K > 64: + b_h2 = b_h2 * b_g_last + if K > 128: + b_h3 = b_h3 * b_g_last + if K > 192: + b_h4 = b_h4 * b_g_last + b_v_new = b_v_new.to(k.dtype.element_ty) + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h1 += tl.dot(b_k, b_v_new) + if K > 64: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h2 += tl.dot(b_k, b_v_new) + if K > 128: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, 
boundary_check=(0, 1)) + b_h3 += tl.dot(b_k, b_v_new) + if K > 192: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h4 += tl.dot(b_k, b_v_new) + + # epilogue + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gated_delta_rule_fwd_h( + k: torch.Tensor, + w: torch.Tensor, + u: torch.Tensor, + g: Optional[torch.Tensor] = None, + initial_state: Optional[torch.Tensor] = None, + output_final_state: bool = False, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? + save_new_value: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, Hg, K, V = *k.shape, u.shape[-1] + H = u.shape[-2] + BT = chunk_size + + chunk_indices = prepare_chunk_indices( + cu_seqlens, chunk_size) if cu_seqlens is not None else None + # N: the actual number of sequences in the batch with either equal or variable lengths + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + else: + N, NT, chunk_offsets = len(cu_seqlens) - 1, len( + chunk_indices), prepare_chunk_offsets(cu_seqlens, BT) + assert K <= 256, "current kernel does not support head dimension larger than 256." 
+ + h = k.new_empty(B, NT, H, K, V) + final_state = k.new_empty( + N, H, K, V, dtype=torch.float32) if output_final_state else None + + v_new = torch.empty_like(u) if save_new_value else None + + def grid(meta): + return (triton.cdiv(V, meta['BV']), N * H) + chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid]( + k=k, + v=u, + w=w, + v_new=v_new, + g=g, + h=h, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + BV=64, + ) + return h, v_new, final_state \ No newline at end of file diff --git a/vllm_kunlun/ops/fla/chunk_o.py b/vllm_kunlun/ops/fla/chunk_o.py new file mode 100644 index 0000000..f861ffc --- /dev/null +++ b/vllm_kunlun/ops/fla/chunk_o.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +# ruff: noqa: E501 + +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import exp +from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper + +BKV_LIST = [64, 128] if check_shared_mem() else [32, 64] +NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8] + + +@triton.heuristics({ + 'USE_G': lambda args: args['g'] is not None, + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None +}) +# @triton.autotune( +# configs=[ +# triton.Config({ +# 'BK': BK, +# 'BV': BV +# }, +# num_warps=num_warps, +# num_stages=num_stages) for BK in BKV_LIST +# for BV in BKV_LIST for num_warps in NUM_WARPS +# for num_stages in [2, 3, 4] +# ], +# key=['H', 'K', 'V', 'BT'], +# ) +@triton.jit(do_not_specialize=['T']) +def chunk_fwd_kernel_o( + q, + k, + v, + h, + g, + o, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + # offset calculation + q += (bos * Hg + i_h // (H // Hg)) * K + k += (bos * Hg + i_h // (H // Hg)) * K + v += (bos * H + i_h) * V + o += (bos * H + i_h) * V + h += (i_tg * H + i_h).to(tl.int64) * K * V + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + b_A = 
tl.zeros([BT, BT], dtype=tl.float32) + + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr(q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), + (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), + (BK, BT), (0, 1)) + p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV), + (BK, BV), (1, 0)) + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + + # [BT, BK] @ [BK, BV] -> [BT, BV] + b_o += tl.dot(b_q, b_h) + # [BT, BK] @ [BK, BT] -> [BT, BT] + b_A += tl.dot(b_q, b_k) + + if USE_G: + g += bos * H + i_h + p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, )) + b_g = tl.load(p_g, boundary_check=(0, )) + b_o = b_o * tl.exp(b_g)[:, None] + b_A = b_A * tl.exp(b_g[:, None] - b_g[None, :]) + + o_t = i_t * BT + tl.arange(0, BT) + # m_t = o_t < T + # m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t) + # b_A = tl.where(m_A, b_A, 0) + b_A = tl.where(o_t[:, None] >= o_t[None, :], b_A, 0) + + p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), + (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), + (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # to fix mma -> mma layout conversion + # already solved by triton v3.2 or higher + b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_fwd_o( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + h: torch.Tensor, + g: Optional[torch.Tensor] = None, # cumsum of log decay + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + chunk_size: int = 64) -> torch.Tensor: + B, T, Hg, K, V = *q.shape, v.shape[-1] + H = v.shape[-2] + if FLA_GDN_FIX_BT: + BT = 64 + else: + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + chunk_indices = 
prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + if scale is None: + scale = k.shape[-1]**-0.5 + + o = torch.empty_like(v) + + def grid(meta): + return (triton.cdiv(V, meta['BV']), NT, B * H) + + chunk_fwd_kernel_o[grid]( + q, + k, + v, + h, + g, + o, + cu_seqlens, + chunk_indices, + scale, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + BK=64, + BV=32 + ) + return o diff --git a/vllm_kunlun/ops/fla/chunk_scaled_dot_kkt.py b/vllm_kunlun/ops/fla/chunk_scaled_dot_kkt.py new file mode 100644 index 0000000..8006c80 --- /dev/null +++ b/vllm_kunlun/ops/fla/chunk_scaled_dot_kkt.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import exp + + + + +@triton.heuristics({ + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None, + 'USE_G': lambda args: args['g_cumsum'] is not None +}) +# @triton.autotune( +# configs=[ +# triton.Config({'BK': BK}, num_warps=num_warps, num_stages=num_stages) +# for BK in [32, 64, 128] for num_warps in [2, 4, 8] +# for num_stages in [2, 3, 4] +# ], +# key=['H', 'K', 'BT', 'IS_VARLEN'], +# ) +@triton.jit(do_not_specialize=['T']) +def chunk_scaled_dot_kkt_fwd_kernel( + k, + beta, + g_cumsum, + A, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, + USE_G: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), 
tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + o_t = i_t * BT + tl.arange(0, BT) + #m_t = o_t < T + + p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ), + (i_t * BT, ), (BT, ), (0, )) + b_beta = tl.load(p_beta, boundary_check=(0, )) + + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K), + (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), + (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = b_k * b_beta[:, None] + b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k)) + + if USE_G: + p_g = tl.make_block_ptr(g_cumsum + bos * H + i_h, (T, ), (H, ), + (i_t * BT, ), (BT, ), (0, )) + b_g = tl.load(p_g, boundary_check=(0, )) + b_g_diff = b_g[:, None] - b_g[None, :] + b_A = b_A * tl.exp(b_g_diff) # 使用了triton而非vllm中的exp + + #m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t) + #b_A = tl.where(m_A, b_A, 0) + b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), + (i_t * BT, 0), (BT, BT), (1, 0)) + tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_scaled_dot_kkt_fwd( + k: torch.Tensor, + beta: torch.Tensor, + g_cumsum: Optional[torch.Tensor] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + chunk_size: int = 64, + output_dtype: torch.dtype = torch.float32) -> torch.Tensor: + r""" + Compute beta * K * K^T. + + Args: + k (torch.Tensor): + The key tensor of shape `[B, T, H, K]`. + beta (torch.Tensor): + The beta tensor of shape `[B, T, H]`. + g_cumsum (torch.Tensor): + The cumulative sum of the gate tensor of shape `[B, T, H]`. 
+ Default: None + cu_seqlens (torch.LongTensor): + The cumulative sequence lengths of the input tensor. + Default: None + chunk_size (int): + The chunk size. Default: 64. + output_dtype (torch.dtype): + The dtype of the output tensor. Default: `torch.float32` + + Returns: + beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size. + """ + + B, T, Hg, K = k.shape + + H = beta.shape[-1] + BT = chunk_size + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype) + chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)]( + k=k, + beta=beta, + g_cumsum=g_cumsum, + A=A, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + Hg=Hg, + K=K, + BT=BT, + BK=64, + ) + return A \ No newline at end of file diff --git a/vllm_kunlun/ops/fla/cumsum.py b/vllm_kunlun/ops/fla/cumsum.py new file mode 100644 index 0000000..318147a --- /dev/null +++ b/vllm_kunlun/ops/fla/cumsum.py @@ -0,0 +1,229 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import warnings +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .utils import check_shared_mem, input_guard + +BS_LIST = [32, 64] if check_shared_mem() else [16, 32] + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +# @triton.autotune(configs=[ +# triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8] +# ], +# key=['B', 'H', 'BT', 'IS_VARLEN', 'REVERSE']) +@triton.jit(do_not_specialize=['T']) +def chunk_local_cumsum_scalar_kernel( + s, + o, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + BT: tl.constexpr, + REVERSE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T, ), (1, ), + (i_t * BT, ), (BT, ), (0, )) + p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T, ), (1, ), + (i_t * BT, ), (BT, ), (0, )) + else: + p_s = tl.make_block_ptr(s + bos * H + i_h, (T, ), (H, ), (i_t * BT, ), + (BT, ), (0, )) + p_o = tl.make_block_ptr(o + bos * H + i_h, (T, ), (H, ), (i_t * BT, ), + (BT, ), (0, )) + # [BT] + b_s = tl.load(p_s, boundary_check=(0, )).to(tl.float32) + b_o = tl.cumsum(b_s, axis=0) + if REVERSE: + b_z = tl.sum(b_s, axis=0) + b_o = -b_o + b_z[None] + b_s + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, )) + + +@triton.heuristics({'IS_VARLEN': lambda args: 
args['cu_seqlens'] is not None}) +# @triton.autotune(configs=[ +# triton.Config({'BS': BS}, num_warps=num_warps) for BS in BS_LIST +# for num_warps in [2, 4, 8] +# ], +# key=['B', 'H', 'S', 'BT', 'IS_VARLEN', 'REVERSE']) +@triton.jit(do_not_specialize=['T']) +def chunk_local_cumsum_vector_kernel( + s, + o, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + o_i = tl.arange(0, BT) + if REVERSE: + m_s = tl.where(o_i[:, None] <= o_i[None, :], 1., 0.) + else: + m_s = tl.where(o_i[:, None] >= o_i[None, :], 1., 0.) 
+ + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + else: + p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + b_o = tl.dot(m_s, b_s, allow_tf32=False) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_local_cumsum_scalar( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: Optional[torch.Tensor] = None, + head_first: bool = False, + output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor: + if head_first: + B, H, T = g.shape + else: + B, T, H = g.shape + assert chunk_size == 2**(chunk_size.bit_length() - + 1), "chunk_size must be a power of 2" + BT = chunk_size + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + grid = (NT, B * H) + chunk_local_cumsum_scalar_kernel[grid]( + s=g_org, + o=g, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + B=B, + H=H, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse, + is_use_mask_zero = True + ) + return g + + +def chunk_local_cumsum_vector( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: Optional[torch.Tensor] = None, + head_first: bool = False, + output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor: + if head_first: + B, H, T, S = g.shape + else: + B, T, H, S = g.shape + BT = chunk_size + chunk_indices = prepare_chunk_indices( + cu_seqlens, chunk_size) if cu_seqlens is not None else 
None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + assert chunk_size == 2**(chunk_size.bit_length() - + 1), "chunk_size must be a power of 2" + + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + + def grid(meta): + return (triton.cdiv(meta['S'], meta['BS']), NT, B * H) + + # keep cumulative normalizer in fp32 + # this kernel is equivalent to + # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1) + chunk_local_cumsum_vector_kernel[grid](g_org, + g, + cu_seqlens, + chunk_indices, + T=T, + B=B, + H=H, + S=S, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse) + return g + + +@input_guard +def chunk_local_cumsum(g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: Optional[torch.Tensor] = None, + head_first: bool = False, + output_dtype: Optional[torch.dtype] = torch.float, + **kwargs) -> torch.Tensor: + if not head_first and g.shape[1] < g.shape[2]: + warnings.warn( + f"Input tensor shape suggests potential format mismatch: seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). " + "This may indicate the inputs were passed in head-first format [B, H, T, ...] " + "when head_first=False was specified. " + "Please verify your input tensor format matches the expected shape [B, T, H, ...].", + stacklevel=2) + if cu_seqlens is not None: + assert g.shape[ + 0] == 1, "Only batch size 1 is supported when cu_seqlens are provided" + if len(g.shape) == 3: + return chunk_local_cumsum_scalar(g, chunk_size, reverse, cu_seqlens, + head_first, output_dtype) + elif len(g.shape) == 4: + return chunk_local_cumsum_vector(g, chunk_size, reverse, cu_seqlens, + head_first, output_dtype) + else: + raise ValueError(f"Unsupported input shape {g.shape}. 
" + f"which should be (B, T, H, D) if `head_first=False` " + f"or (B, H, T, D) otherwise") \ No newline at end of file diff --git a/vllm_kunlun/ops/fla/fused_recurrent.py b/vllm_kunlun/ops/fla/fused_recurrent.py new file mode 100644 index 0000000..143b6a0 --- /dev/null +++ b/vllm_kunlun/ops/fla/fused_recurrent.py @@ -0,0 +1,153 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +import xtorch_ops + + +class FusedRecurrentFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + inplace_final_state: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, + ssm_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, + use_qk_l2norm_in_kernel: bool = False): + + o, final_state = xtorch_ops.fused_recurrent_gated_delta_rule_fwdv2( + q.contiguous(), + k.contiguous(), + v.contiguous(), + g.contiguous(), + beta.contiguous(), + scale, + initial_state, + inplace_final_state=inplace_final_state, + cu_seqlens=cu_seqlens, + h0_indices=ssm_state_indices, + num_accepted_tokens=num_accepted_tokens, + use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + ) + return o, final_state + + +def fused_recurrent_gated_delta_rule( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor = None, + scale: float = None, + initial_state: torch.Tensor = None, + inplace_final_state: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, 
+ ssm_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, + use_qk_l2norm_in_kernel: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, H, K]`. + k (torch.Tensor): + keys of shape `[B, T, H, K]`. + v (torch.Tensor): + values of shape `[B, T, HV, V]`. + GVA is applied if `HV > H`. + g (torch.Tensor): + g (decays) of shape `[B, T, HV]`. + beta (torch.Tensor): + betas of shape `[B, T, HV]`. + scale (Optional[int]): + Scale factor for the RetNet attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[N, HV, K, V]` for `N` input sequences. + For equal-length input sequences, `N` equals the batch size `B`. + Default: `None`. + inplace_final_state: bool: + Whether to store the final state in-place to save memory. + Default: `True`. + cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + ssm_state_indices (Optional[torch.Tensor]): + Indices to map the input sequences to the initial/final states. + num_accepted_tokens (Optional[torch.Tensor]): + Number of accepted tokens for each sequence during decoding. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, HV, V]`. + final_state (torch.Tensor): + Final state of shape `[N, HV, K, V]`. 
+ + Examples:: + >>> import torch + >>> import torch.nn.functional as F + >>> from einops import rearrange + >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule + # inputs with equal lengths + >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512 + >>> q = torch.randn(B, T, H, K, device='cuda') + >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1) + >>> v = torch.randn(B, T, HV, V, device='cuda') + >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda')) + >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid() + >>> h0 = torch.randn(B, HV, K, V, device='cuda') + >>> o, ht = fused_gated_recurrent_delta_rule( + q, k, v, g, beta, + initial_state=h0, + ) + # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required + >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta)) + # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected + >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long) + >>> o_var, ht_var = fused_gated_recurrent_delta_rule( + q, k, v, g, beta, + initial_state=h0, + cu_seqlens=cu_seqlens + ) + """ + if cu_seqlens is not None and q.shape[0] != 1: + raise ValueError( + f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`." 
+ f"Please flatten variable-length inputs before processing.") + if scale is None: + scale = k.shape[-1]**-0.5 + else: + assert scale > 0, "scale must be positive" + if beta is None: + beta = torch.ones_like(q[..., 0]) + o, final_state = FusedRecurrentFunction.apply( + q, + k, + v, + g, + beta, + scale, + initial_state, + inplace_final_state, + cu_seqlens, + ssm_state_indices, + num_accepted_tokens, + use_qk_l2norm_in_kernel, + ) + return o, final_state \ No newline at end of file diff --git a/vllm_kunlun/ops/fla/index.py b/vllm_kunlun/ops/fla/index.py new file mode 100644 index 0000000..61b0859 --- /dev/null +++ b/vllm_kunlun/ops/fla/index.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import torch + +from vllm.triton_utils import triton + +from .utils import tensor_cache + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return cu_seqlens[1:] - cu_seqlens[:-1] + + +@tensor_cache +def prepare_chunk_indices(cu_seqlens: torch.LongTensor, + chunk_size: int) -> torch.LongTensor: + indices = torch.cat([ + torch.arange(n) + for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist() + ]) + return torch.stack([indices.eq(0).cumsum(0) - 1, indices], + 1).to(cu_seqlens) + +@tensor_cache +def prepare_chunk_offsets(cu_seqlens: torch.LongTensor, + chunk_size: int) -> torch.LongTensor: + return torch.cat([ + cu_seqlens.new_tensor([0]), + triton.cdiv(prepare_lens(cu_seqlens), chunk_size) + ]).cumsum(-1) diff --git a/vllm_kunlun/ops/fla/l2norm.py b/vllm_kunlun/ops/fla/l2norm.py new file mode 100644 index 0000000..ef9788c --- /dev/null +++ 
b/vllm_kunlun/ops/fla/l2norm.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import os +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +BT_LIST = [8, 16, 32, 64, 128] + +USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0")) + + +@triton.autotune(configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4, 8, 16, 32] +], + key=['D']) +@triton.jit +def l2norm_fwd_kernel1( + x, + y, + D, + BD: tl.constexpr, + eps, +): + i_t = tl.program_id(0) + x += i_t * D + y += i_t * D + # Compute mean and variance + cols = tl.arange(0, BD) + mask = cols < D + b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32) + b_var = tl.sum(b_x * b_x, axis=0) + b_rstd = 1 / tl.sqrt(b_var + eps) + # tl.store(Rstd + i_t, rstd) + # Normalize and apply linear transformation + b_y = b_x * b_rstd + tl.store(y + cols, b_y, mask=mask) + + +@triton.autotune(configs=[ + triton.Config({'BT': BT}, num_warps=num_warps) + for num_warps in [1, 2, 4, 8, 16] for BT in BT_LIST +], + key=['D']) +@triton.jit(do_not_specialize=["NB"]) +def l2norm_fwd_kernel( + x, + y, + eps, + NB, + T, + D: tl.constexpr, + BT: tl.constexpr, + BD: tl.constexpr, +): + i_t = tl.program_id(0) + p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)) + b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32) + b_var = tl.sum(b_x * b_x, axis=1) + b_y = b_x / tl.sqrt(b_var + eps)[:, None] + p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)) + tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit +def 
l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr): + xoffset = tl.program_id(0) * MBLOCK + row_idx = xoffset + tl.arange(0, MBLOCK)[:, None] + xmask = row_idx < M + rindex = tl.arange(0, N)[None, :] + xs = tl.load(X + (rindex + N * row_idx), xmask).to(tl.float32) + square = tl.broadcast_to(xs * xs, [MBLOCK, N]) + square_sum = tl.sum(tl.where(xmask, square, 0), 1)[:, None] + rsqrt = tl.rsqrt(square_sum + eps) + tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask) + + +def l2norm_fwd(x: torch.Tensor, + eps: float = 1e-6, + output_dtype: Optional[torch.dtype] = None): + x_shape_og = x.shape + x = x.view(-1, x.shape[-1]) + # allocate output + if output_dtype is None: + y = torch.empty_like(x) + else: + y = torch.empty_like(x, dtype=output_dtype) + assert y.stride(-1) == 1 + T, D = x.shape[0], x.shape[-1] + # rstd = torch.empty((T,), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D)) + if D > BD: + raise RuntimeError("This layer doesn't support feature dim >= 64KB.") + + if not USE_DEFAULT_FLA_NORM: + MBLOCK = 32 + # M, N = x.shape + l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK), )]( + x, + y, + eps, + T, + D, + MBLOCK, + ) + else: + if D <= 512: + NB = triton.cdiv(T, 2048) + + def grid(meta): + return (triton.cdiv(T, meta['BT']), ) + + l2norm_fwd_kernel[grid]( + x, + y, + eps, + NB=NB, + T=T, + D=D, + BD=BD, + ) + else: + l2norm_fwd_kernel1[(T, )]( + x, + y, + eps=eps, + D=D, + BD=BD, + ) + + return y.view(x_shape_og) diff --git a/vllm_kunlun/ops/fla/layernorm_guard.py b/vllm_kunlun/ops/fla/layernorm_guard.py new file mode 100644 index 0000000..a6a5f43 --- /dev/null +++ b/vllm_kunlun/ops/fla/layernorm_guard.py @@ -0,0 +1,343 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Tri Dao +# +# This file contains code copied from 
the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2024, Tri Dao. + +# ruff: noqa: E501 +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. + +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from vllm.triton_utils import tl, triton + +from .utils import input_guard + + +def rms_norm_ref(x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + upcast=True): + dtype = x.dtype + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * + weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + + eps) + out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) + + +@triton.heuristics({ + "HAS_BIAS": lambda args: args["B"] is not None, + "HAS_Z": lambda args: args["Z"] is not None, +}) +@triton.jit +def layer_norm_fwd_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + Z, # pointer to the other branch + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_z_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + BLOCK_N: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_Z: tl.constexpr, + NORM_BEFORE_GATE: tl.constexpr, + IS_RMS_NORM: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. + row = tl.program_id(0) + group = tl.program_id(1) + X += row * stride_x_row + group * N + Y += row * stride_y_row + group * N + if HAS_Z: + Z += row * stride_z_row + group * N + if not IS_RMS_NORM: + Mean += group * M + Rstd += group * M + W += group * N + if HAS_BIAS: + B += group * N + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) + if HAS_Z and not NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=cols < N).to(tl.float32) + x *= z * tl.sigmoid(z) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.) 
+ var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * w + b if HAS_BIAS else x_hat * w + if HAS_Z and NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=mask).to(tl.float32) + y *= z * tl.sigmoid(z) + # Write output + tl.store(Y + cols, y, mask=mask) + + +def layer_norm_fwd( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + z: torch.Tensor = None, + out: torch.Tensor = None, + group_size: int = None, + norm_before_gate: bool = True, + is_rms_norm: bool = False, +): + M, N = x.shape + if group_size is None: + group_size = N + assert N % group_size == 0 + ngroups = N // group_size + assert x.stride(-1) == 1 + if z is not None: + assert z.stride(-1) == 1 + assert z.shape == (M, N) + # if weight.shape != (N,): + # weight = weight.reshape(N) + # print("weight",weight.shape) + # print("x",x.shape) + assert weight.shape == (N, ) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N, ) + # allocate output + if out is not None: + assert out.shape == x.shape + else: + out = torch.empty_like(x) + assert out.stride(-1) == 1 + mean = torch.empty((ngroups * M, ), dtype=torch.float32, + device=x.device) if not is_rms_norm else None + rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size)) + if group_size > BLOCK_N: + raise RuntimeError( + "This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + num_warps = min(max(BLOCK_N // 256, 1), 8) + grid = (M, ngroups) + 
class LayerNormFn(torch.autograd.Function):
    """Autograd wrapper around the fused (gated) LayerNorm / RMSNorm forward."""

    @input_guard
    @staticmethod
    def forward(ctx,
                x,
                weight,
                bias,
                z=None,
                eps=1e-6,
                group_size=None,
                norm_before_gate=True,
                is_rms_norm=False):
        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
        """

        def _as_rows(t):
            # Collapse all leading dims; the kernel needs a unit stride on
            # the last dim, so copy only when that is not already the case.
            rows = t.reshape(-1, t.shape[-1])
            return rows if rows.stride(-1) == 1 else rows.contiguous()

        x_shape_og = x.shape
        x = _as_rows(x)
        if z is not None:
            # The gate must match the input's original (un-flattened) shape.
            assert z.shape == x_shape_og
            z = _as_rows(z)
        weight = weight.contiguous()
        bias = bias.contiguous() if bias is not None else None

        y, mean, rstd = layer_norm_fwd(
            x,
            weight,
            bias,
            eps,
            z=z,
            group_size=group_size,
            norm_before_gate=norm_before_gate,
            is_rms_norm=is_rms_norm,
        )

        # Stash everything the forward produced/consumed on the context.
        ctx.save_for_backward(x, weight, bias, mean, rstd, z)
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.group_size = group_size
        ctx.norm_before_gate = norm_before_gate
        ctx.is_rms_norm = is_rms_norm
        return y.reshape(x_shape_og)


def layernorm_fn(x,
                 weight,
                 bias,
                 z=None,
                 eps=1e-6,
                 group_size=None,
                 norm_before_gate=True,
                 is_rms_norm=False):
    """Functional form of LayerNormFn; arguments are forwarded positionally."""
    positional = (x, weight, bias, z, eps, group_size, norm_before_gate,
                  is_rms_norm)
    return LayerNormFn.apply(*positional)


def rmsnorm_fn(x,
               weight,
               bias,
               z=None,
               eps=1e-6,
               group_size=None,
               norm_before_gate=True):
    """Same as layernorm_fn with the RMSNorm variant forced on."""
    positional = (x, weight, bias, z, eps, group_size, norm_before_gate, True)
    return LayerNormFn.apply(*positional)
class RMSNormGated(nn.Module):
    """RMSNorm with an optional SiLU gate and optional per-group statistics."""

    def __init__(
        self,
        hidden_size,
        eps: float = 1e-5,
        group_size: Optional[int] = None,
        norm_before_gate: bool = False,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """If group_size is not None, we do GroupNorm with each group having group_size elements.
        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
        """
        super().__init__()
        self.eps = eps
        # Learnable scale; filled with ones by reset_parameters() below.
        self.weight = nn.Parameter(
            torch.empty(hidden_size, device=device, dtype=dtype))
        # RMSNorm carries no bias, but the attribute is registered so that
        # `self.bias` exists and reads as None.
        self.register_parameter("bias", None)
        self.group_size = group_size
        self.norm_before_gate = norm_before_gate
        self.reset_parameters()

    def reset_parameters(self):
        """Reset the scale to the identity transform (all ones)."""
        torch.nn.init.ones_(self.weight)

    def forward(self, x, z=None):
        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
        """
        options = dict(
            z=z,
            eps=self.eps,
            group_size=self.group_size,
            norm_before_gate=self.norm_before_gate,
        )
        return rmsnorm_fn(x, self.weight, self.bias, **options)
# Select the scalar math primitives used by the FLA kernels.
# FLA_USE_FAST_OPS=1 binds div/exp/log/log2 to the `tldevice.fast_*`
# functions; any other value binds them to plain division and the regular
# `tl` functions.  The choice is made once, at import time.
if os.environ.get('FLA_USE_FAST_OPS', '0') == '1':
    div = tldevice.fast_dividef
    exp = tldevice.fast_expf
    log = tldevice.fast_logf
    log2 = tldevice.fast_log2f
else:

    @triton.jit
    def div_normal(x, y):
        return x / y

    div = div_normal
    exp = tl.exp
    log = tl.log
    log2 = tl.log2


# `gather` resolves to tl.gather when this Triton build provides it.
# NOTE(review): the fallback below does NOT gather -- it returns `src`
# unchanged and ignores `index`/`axis`, so a kernel that depends on real
# gather semantics would compute wrong values on such a build.
if not hasattr(tl, 'gather'):

    @triton.jit
    def gather(src, index, axis, _builder=None):
        # This is a fallback implementation when tl.gather is not supported
        # In order to pass triton compiler, there is no actual gather operation
        return src
else:
    gather = tl.gather
def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
    """Return per-sequence lengths as differences of consecutive offsets."""
    return cu_seqlens[1:] - cu_seqlens[:-1]

# NOTE(review): this module also imports `prepare_chunk_indices` from
# `.index`; the definition below rebinds that name for the rest of the file.
def prepare_chunk_indices(
    cu_seqlens: torch.LongTensor, chunk_size: int
) -> torch.LongTensor:
    """Enumerate the chunks of every sequence.

    Each sequence contributes ceil(len / chunk_size) rows.  The result has
    two columns: column 0 is a running sequence counter (it advances every
    time the per-sequence chunk index restarts at 0) and column 1 is the
    chunk index inside that sequence.  The result is cast to the dtype and
    device of `cu_seqlens`.
    """
    indices = torch.cat(
        [
            torch.arange(n)
            for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
        ]
    )
    return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)


# For every 16-row chunk, take the 16x16 block of A on the chunk's diagonal,
# keep its strictly lower-triangular part, and write a 16x16 result to Ad.
# The row-by-row update below is a forward substitution; per the `solve_tril`
# docstring the intended result is (I + A)^-1 for that diagonal block.
#
# Grid: (number of 16-row chunks, B * H).  With IS_VARLEN the chunk id is
# translated through `chunk_indices` into (sequence, chunk-in-sequence) and
# T is replaced by that sequence's length read from `cu_seqlens`.
@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
# @triton.autotune(
#     configs=[
#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
#         for num_warps in [1, 2, 4, 8]
#         for num_stages in [2, 3, 4, 5]
#     ],
#     key=["BT"],
# )
@triton.jit(do_not_specialize=["T"])
def solve_tril_16x16_kernel(
    A,
    Ad,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    BT: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
            chunk_indices + i_t * 2 + 1
        ).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
            cu_seqlens + i_n + 1
        ).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    # Move the base pointers to the first row of this sequence / head.
    A = A + (bos * H + i_h) * BT
    Ad = Ad + (bos * H + i_h) * 16

    # Column at which this chunk's diagonal block starts inside a BT-wide row.
    offset = (i_t * 16) % BT
    p_A = tl.make_block_ptr(
        A, (T, BT), (H * BT, 1), (i_t * 16, offset), (16, 16), (1, 0)
    )
    p_Ai = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), (1, 0))
    b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
    # Keep only the strictly lower triangle and negate it.
    b_A = -tl.where(tl.arange(0, 16)[:, None] > tl.arange(0, 16)[None, :], b_A, 0)

    o_i = tl.arange(0, 16)
    # Rows are processed top to bottom; the bound stops at the last valid
    # row when the final chunk of a sequence is shorter than 16.
    for i in range(1, min(16, T - i_t * 16)):
        b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset)
        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0)
        mask = o_i == i
        # Replace row i of the running block with the updated row.
        b_A = tl.where(mask[:, None], b_a, b_A)
    # Add the identity on the diagonal.
    b_A += o_i[:, None] == o_i[None, :]
    tl.store(
        p_Ai,
        b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
        boundary_check=(0, 1),
    )
range16[:, None] * 16 + range16[None, :] + tl.store( + newp_Ad, + b_A.to(subAd.dtype.element_ty, fp_downcast_rounding="rtne"), + ) + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +# @triton.autotune( +# configs=[ +# triton.Config({}, num_warps=num_warps, num_stages=num_stages) +# for num_warps in [1, 2, 4, 8] +# for num_stages in [2, 3, 4, 5] +# ], +# key=["BT"], +# ) +@triton.jit(do_not_specialize=["T"]) +def solve_tril_16x16_kernel_modified_in_Loop( + i_t, + i_bh, + i_n, + bos, + i_b, + i_h, + subA, + subAd, + AInLoop, + ba_reduce, + loopIdx, + reduce_res, + A, + Ad, + cu_seqlens, + chunk_indices, + T, # 32 + H: tl.constexpr, # 4 + BT: tl.constexpr, # 64 + IS_VARLEN: tl.constexpr, +): + range16 = tl.arange(0, 16) + newp_A = subA + range16[:, None] * 16 + range16[None, :] + b_A = tl.load(newp_A).to(tl.float32) + # print("[loop impl-0]loopIdx:", loopIdx) + # print("[loop impl-1]b_A value in now loopIdx:", b_A) + + o_i = tl.arange(0, 16) + i=loopIdx + b_a = -tl.load(AInLoop + o_i) + # print("[loop impl-2]b_a value in now loopIdx:", b_a) + red_res = b_a[:, None] * b_A + # print("[Triton]red_res=", red_res) + tl.store(reduce_res + range16[:, None] * 16 + range16[None, :], red_res) + # b_a = b_a + tl.sum(b_a[:, None] * b_A, 1) # TODO: revert to 0 + # # print("triton reduce b_a", b_a) + # tl.store(ba_reduce + o_i, b_a) + + # mask = o_i == i + # # print("mask", mask[:, None]) + # # print("b_a", b_a) + # # print("b_A", b_A) + # print("before b_A", b_A) + # b_A = tl.where(mask[:, None], b_a, b_A) + # print("[loop impl-3]b_A result in now loopIdx:", b_A) + + # tl.store(newp_A, b_A) + + +def solve_tril_16x16_kernel_new( + NT, + B, + A, + Ad, + cu_seqlens, + chunk_indices, + T, + H, + BT, + IS_VARLEN, +): + Ad_modify = Ad + for loopX in range(NT): + # i_n, i_t = tl.load(chunk_indices ... 
+ chunk_indices_load_offset_1 = loopX * 2 + row_idx = chunk_indices_load_offset_1 // chunk_indices.shape[1] + col_idx = chunk_indices_load_offset_1 % chunk_indices.shape[1] + i_n = int(chunk_indices[row_idx, col_idx]) + chunk_indices_load_offset_2 = loopX * 2 + 1 + row_idx = chunk_indices_load_offset_2 // chunk_indices.shape[1] + col_idx = chunk_indices_load_offset_2 % chunk_indices.shape[1] + i_t = int(chunk_indices[row_idx, col_idx]) + + # bos, eos = tl.load(cu_seqlens ... + cu_seqlens_load_offset_1 = i_n + bos = int(cu_seqlens[cu_seqlens_load_offset_1]) + cu_seqlens_load_offset_2 = i_n + 1 + eos = int(cu_seqlens[cu_seqlens_load_offset_2]) + T = eos - bos + + for loopY in range(B * H): + i_b = loopY // H + i_h = loopY % H + + # get subA + if (bos * H + i_h) < H: + Tstart = loopX * 16 % BT + Tend = Tstart + 16 + BTstart = loopX * 16 % BT + BTend = BTstart + 16 + subA = A[0, Tstart:Tend, loopY, BTstart:BTend].contiguous().clone() + # print(f"subA slice A dim[0, {Tstart}:{Tend}, {loopY}, {BTstart}:{BTend}]") + if (Tend > T): # bondary check + subA[T-16:, :] = 0 + + # subA.shape torch.Size([9, 16]) + # vvv + # subA.shape torch.Size([16, 16]) 用0补齐 + if subA.shape[0] < 16: + pad_rows = 16 - subA.shape[0] + zeros = torch.zeros((pad_rows, subA.shape[1]), dtype=subA.dtype, device=subA.device) + subA = torch.cat([subA, zeros], dim=0) + else: + assert(0) & "need deal this situation" + + # get subAd + if (bos * H + i_h) < H: + Tstart = loopX * 16 + Tend = Tstart + 16 + BTstart = 0 * 16 + BTend = BTstart + 16 + subAd = Ad_modify[0, Tstart:Tend, loopY, BTstart:BTend].contiguous().clone() + # print(f'T={T}, Tstart={Tstart}, Tend={Tend}, BTstart={BTstart}, BTend={BTend}') + else: + assert(0) & "need deal this situation" + + mask = (torch.arange(16, device=subA.device)[:, None] > torch.arange(16, device=subA.device)[None, :]) + subA = -torch.where(mask, subA, torch.zeros_like(subA)) + + for inLoopIdx in range(1, min(16, T - i_t * 16)): + # print(f"loopX={loopX}, loopY={loopY}, 
# @input_guard
def solve_tril(
    A: torch.Tensor,
    cu_seqlens: Optional[torch.Tensor] = None,
    output_dtype: torch.dtype = torch.float,
) -> torch.Tensor:
    """
    Compute the inverse of the lower triangular matrix
    A should be strictly lower triangular, i.e., A.triu() == 0.

    Args:
        A (torch.Tensor):
            [B, T, H, K], with K restricted to 16, 32 or 64.
        cu_seqlens (torch.Tensor):
            The cumulative sequence lengths of the input tensor.
            Default: None.
        output_dtype (torch.dtype):
            The dtype of the output tensor. Default: `torch.float`.
            It is only consulted when K == 16; otherwise the work buffer is
            allocated as `torch.float`.

    Returns:
        A tensor of shape [B, T, H, 16] holding the per-16-row-chunk results.
        With TRITON_INTERPRET=1 the buffer written by
        `solve_tril_16x16_kernel` is returned as allocated; otherwise the
        result of `solve_tril_16x16_kernel_new` is returned cast to
        `A.dtype`.

        NOTE(review): the upstream contract is "(I + A)^-1 with the same
        shape as A"; this function only ever allocates and returns a
        16-wide last dimension, so for K in {32, 64} the result presumably
        covers just the 16x16 diagonal blocks -- confirm with callers.
    """
    assert A.shape[-1] in [16, 32, 64]

    B, T, H, BT = A.shape

    # The buffer is pre-filled with the sentinel value -999; entries the
    # kernels do not write keep that value.
    Ad = -999 * torch.ones(
        B, T, H, 16, device=A.device, dtype=torch.float if BT != 16 else output_dtype
    )

    Ad_modify = Ad.clone()

    chunk_indices = (
        prepare_chunk_indices(cu_seqlens, 16) if cu_seqlens is not None else None
    )
    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, 16)

    # Under the Triton interpreter the single fused kernel is launched
    # directly; every other run goes through the host-side loop below.
    import os
    if os.getenv("TRITON_INTERPRET", None) == "1":
        solve_tril_16x16_kernel[NT, B * H](
            A=A,
            Ad=Ad,
            cu_seqlens=cu_seqlens,
            chunk_indices=chunk_indices,
            T=T,
            H=H,
            BT=BT,
            num_warps=1,
            num_stages=4,
        )
        return Ad

    # NOTE(review): solve_tril_16x16_kernel_new indexes `chunk_indices` and
    # `cu_seqlens` unconditionally, so this path appears to require
    # cu_seqlens to be provided -- confirm before calling with None.
    Ad_modify = solve_tril_16x16_kernel_new(
        NT,
        B,
        A=A,
        Ad=Ad_modify,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        BT=BT,
        IS_VARLEN= True if cu_seqlens is not None else False,
        # num_warps=1,
        # num_stages=4,
    ).to(A.dtype)
    return Ad_modify
def torch_chunk_gated_delta_rule(
    query,
    key,
    value,
    g,
    beta,
    chunk_size=64,
    initial_state=None,
    output_final_state=False,
    use_qk_l2norm_in_kernel=False,
):
    """Pure-PyTorch chunked gated delta rule.

    `query`/`key`/`value` are indexed as (batch, seq, heads, dim) and
    `g`/`beta` as (batch, seq, heads); dims 1 and 2 are swapped internally
    and all math runs in float32.  The sequence is zero-padded up to a
    multiple of `chunk_size`, processed chunk by chunk while carrying a
    (batch, heads, k_dim, v_dim) recurrent state, and the padding is cut
    off again before returning.

    Returns `(output, state)`: the output has the layout of `value` and the
    dtype of `query`; `state` is the final recurrent state, or None unless
    `output_final_state` is set.
    """
    out_dtype = query.dtype
    if use_qk_l2norm_in_kernel:
        query = l2norm(query, dim=-1, eps=1e-6)
        key = l2norm(key, dim=-1, eps=1e-6)

    def _heads_first(t):
        return t.transpose(1, 2).contiguous().to(torch.float32)

    query = _heads_first(query)
    key = _heads_first(key)
    value = _heads_first(value)
    beta = _heads_first(beta)
    g = _heads_first(g)

    batch_size, num_heads, sequence_length, k_head_dim = key.shape
    v_head_dim = value.shape[-1]

    # Right-pad the sequence axis so it splits evenly into chunks.
    pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size
    num_chunks = (sequence_length + pad_size) // chunk_size
    seq_pad = (0, 0, 0, pad_size)
    query = F.pad(query, seq_pad)
    key = F.pad(key, seq_pad)
    value = F.pad(value, seq_pad)
    beta = F.pad(beta, (0, pad_size))
    g = F.pad(g, (0, pad_size))

    query = query * (1 / (query.shape[-1] ** 0.5))

    def _chunked(t):
        return t.reshape(t.shape[0], t.shape[1], -1, chunk_size, t.shape[-1])

    beta_col = beta.unsqueeze(-1)
    v_beta = _chunked(value * beta_col)
    k_beta = _chunked(key * beta_col)
    query = _chunked(query)
    key = _chunked(key)
    value = _chunked(value)

    # Cumulative log-decay inside each chunk.
    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size).cumsum(dim=-1)

    square = torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device)
    upper_with_diag = torch.triu(square, diagonal=0)
    upper_strict = torch.triu(square, diagonal=1)

    decay_mask = (g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float().tril()
    attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(upper_with_diag, 0)
    # Row-by-row forward substitution; each row depends on the rows above,
    # so this loop must stay sequential and in-place.
    for i in range(1, chunk_size):
        row = attn[..., i, :i].clone()
        sub = attn[..., :i, :i].clone()
        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
    value = attn @ v_beta
    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))

    if initial_state is None:
        state = torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value)
    else:
        state = initial_state.to(value)
    core_attn_out = torch.zeros_like(value)

    # for each chunk
    for c in range(num_chunks):
        q_c, k_c, v_c = query[:, :, c], key[:, :, c], value[:, :, c]
        g_c = g[:, :, c]
        g_last = g_c[:, :, -1]
        intra = (q_c @ k_c.transpose(-1, -2) * decay_mask[:, :, c]).masked_fill_(upper_strict, 0)
        v_prime = k_cumdecay[:, :, c] @ state
        v_new = v_c - v_prime
        inter = (q_c * g_c[:, :, :, None].exp()) @ state
        core_attn_out[:, :, c] = inter + intra @ v_new
        state = (
            state * g_last[:, :, None, None].exp()
            + (k_c * (g_last[:, :, None] - g_c).exp()[..., None]).transpose(-1, -2) @ v_new
        )

    final_state = state if output_final_state else None
    merged = core_attn_out.reshape(
        core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
    merged = merged[:, :, :sequence_length]
    merged = merged.transpose(1, 2).contiguous().to(out_dtype)
    return merged, final_state
def tensor_cache(
        fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
    """
    Memoize the most recent calls of `fn`, matching arguments by identity.

    A call is a cache hit only when it has the same number of positional
    and keyword arguments as a stored call and every argument *is* (not
    merely equals) the stored one.  Up to 4 calls are remembered; a hit
    moves its entry to the most-recent position and, when the cache is
    full, the least recently used entry is dropped.

    The stored entries hold references to the arguments and results.

    Args:
        fn (Callable[..., torch.Tensor]):
            The function to be decorated.

    Returns:
        Callable[..., torch.Tensor]:
            The caching wrapper around `fn`.
    """

    max_entries = 4
    # Oldest entry first; each entry is (args, kwargs, result).
    entries: list[tuple[tuple, dict, Any]] = []

    def _same_call(args: tuple, kwargs: dict, old_args: tuple,
                   old_kwargs: dict) -> bool:
        if len(args) != len(old_args) or len(kwargs) != len(old_kwargs):
            return False
        if not all(new is old for new, old in zip(args, old_args)):
            return False
        return all(name in old_kwargs and val is old_kwargs[name]
                   for name, val in kwargs.items())

    @functools.wraps(fn)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        for pos, (old_args, old_kwargs, old_result) in enumerate(entries):
            if _same_call(args, kwargs, old_args, old_kwargs):
                # Refresh recency: re-insert the hit at the newest slot.
                del entries[pos]
                entries.append((args, kwargs, old_result))
                return old_result

        result = fn(*args, **kwargs)

        if len(entries) >= max_entries:
            del entries[0]
        entries.append((args, kwargs, result))
        return result

    return wrapper
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + contiguous_args = (i if not isinstance(i, torch.Tensor) else + i.contiguous() for i in args) + contiguous_kwargs = { + k: (v if not isinstance(v, torch.Tensor) else v.contiguous()) + for k, v in kwargs.items() + } + + tensor = None + for arg in args: + if isinstance(arg, torch.Tensor): + tensor = arg + break + if tensor is None: + for value in kwargs.values(): + if isinstance(value, torch.Tensor): + tensor = value + break + + if tensor is not None: + ctx = torch.cuda.device(tensor.device.index) + else: + ctx = contextlib.nullcontext() + + with ctx: + return fn(*contiguous_args, **contiguous_kwargs) + + return wrapper + + +@functools.cache +def get_available_device() -> str: + try: + return triton.runtime.driver.active.get_current_target().backend + except BaseException: + return 'cpu' + + +@functools.cache +def _check_platform() -> Literal['nvidia', 'amd', 'intel', 'musa']: + device = get_available_device() + mapping = { + "cuda": "nvidia", + "hip": "amd", + "xpu": "intel", + } + # return the mapped value, or the original if not found + return mapping.get(device, device) + + +# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'. +# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs. +# Therefore, we need to check the triton backend to determine the actual GPU vendor. 
# torch exposes AMD devices under `torch.cuda`, so the Triton backend name
# 'hip' is mapped to 'cuda' before looking up the torch device module.
device = get_available_device() if get_available_device() != 'hip' else 'cuda'
device_torch_lib = getattr(torch, device)
device_platform = _check_platform()

is_amd = (device_platform == 'amd')
# NOTE(review): `is_intel` is compared against 'nvidia', not 'intel'.  As
# written it is True exactly when `is_nvidia` is True, which makes the
# `torch.xpu.get_device_name(0)` probe below run on the 'nvidia' platform
# and never on 'intel'.  This may be a deliberate workaround for a target
# whose Triton backend is reported as 'xpu' -- confirm before changing.
is_intel = (device_platform == 'nvidia')
is_nvidia = (device_platform == 'nvidia')
is_intel_alchemist = (is_intel
                      and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0))
is_nvidia_hopper = (is_nvidia
                    and ('NVIDIA H' in torch.cuda.get_device_name(0)
                         or torch.cuda.get_device_capability()[0] >= 9))
use_cuda_graph = (is_nvidia
                  and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1')


def get_all_max_shared_mem():
    """Return the 'max_shared_mem' device property for every visible device.

    Falls back to `[-1]` when the Triton driver query fails for any reason.
    """
    try:
        return [
            triton.runtime.driver.active.utils.get_device_properties(i)
            ['max_shared_mem'] for i in range(device_torch_lib.device_count())
        ]
    except BaseException:
        return [-1]


class Backend(Enum):
    """Shared-memory thresholds, keyed by architecture name."""
    ADA = 101376  # RTX 4090
    AMPERE = 166912  # A100
    HOPPER = 232448  # H100
    DEFAULT = 102400  # Default

    @classmethod
    def get_shared_memory(cls, arch: str) -> int:
        """Look up `arch` case-insensitively; unknown names give DEFAULT."""
        try:
            return cls[arch.upper()].value
        except KeyError:
            return cls.DEFAULT.value


@functools.cache
def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:
    """Return True if device `tensor_idx` meets the threshold for `arch`.

    Any failure (for example an out-of-range `tensor_idx`) yields False.
    The answer is cached per (arch, tensor_idx).
    """
    try:
        device_shared_mem_list = get_all_max_shared_mem()
        max_shared_memory = device_shared_mem_list[tensor_idx]
        return max_shared_memory >= Backend.get_shared_memory(arch)
    except Exception:
        return False
# Relative tolerance used by assert_close, per dtype.
RESOLUTION = {
    torch.bool: 0,
    torch.int16: 0,
    torch.int32: 0,
    torch.int64: 0,
    torch.float16: 1e-3,
    torch.float32: 1.3e-6,
    torch.bfloat16: 0.016,
    torch.complex32: 1e-3,
    torch.complex64: 1.3e-6,
}


def assert_close(res, ref, dtype, equal_nan=False, reduce_dim=1):
    """Assert that `res` matches `ref` within dtype-dependent tolerances.

    `res` must already have dtype `dtype`; `ref` is cast to it.  The
    absolute tolerance is 1e-3 scaled by `reduce_dim`, and the relative
    tolerance comes from RESOLUTION (a dtype missing from that table raises
    KeyError).  Mismatches raise AssertionError.
    """
    assert res.dtype == dtype
    expected = ref.to(dtype)
    tolerances = {
        "atol": 1e-3 * reduce_dim,
        "rtol": RESOLUTION[dtype],
    }
    torch.testing.assert_close(res, expected, equal_nan=equal_nan, **tolerances)
# Computes w = A @ (k * beta * exp(g)) for one BT-row chunk of one head.
#
# Grid: (number of chunks, B * H).  With IS_VARLEN the chunk id is translated
# through `chunk_indices` into (sequence, chunk-in-sequence) and T becomes
# that sequence's length.  `k` is indexed with Hg heads and each of the H
# output heads reads key head i_h // (H // Hg); `w` is indexed with H heads.
# `v`, `u`, `V` and `BV` are accepted but not used by this kernel.
@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
# @triton.autotune(
#     configs=[
#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
#         for num_warps in [2, 4, 8]
#         for num_stages in [2, 3, 4]
#     ],
#     key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
# )
@triton.jit(do_not_specialize=["T"])
def recompute_w_fwd_kernel(
    k,
    v,
    beta,
    w,
    u,
    A,
    g,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    Hg: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
            chunk_indices + i_t * 2 + 1
        ).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
            cu_seqlens + i_n + 1
        ).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T
    p_beta = tl.make_block_ptr(
        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
    )
    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,))
    p_A = tl.make_block_ptr(
        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
    )
    b_beta = tl.load(p_beta, boundary_check=(0,))
    b_A = tl.load(p_A, boundary_check=(0, 1))
    # g is exponentiated once and reused for every K tile.
    b_g = tl.exp(tl.load(p_g, boundary_check=(0,)))

    # Walk the key dimension in tiles of BK columns.
    for i_k in range(tl.cdiv(K, BK)):
        p_k = tl.make_block_ptr(
            k + (bos * Hg + i_h // (H // Hg)) * K,
            (T, K),
            (Hg * K, 1),
            (i_t * BT, i_k * BK),
            (BT, BK),
            (1, 0),
        )
        p_w = tl.make_block_ptr(
            w + (bos * H + i_h) * K,
            (T, K),
            (H * K, 1),
            (i_t * BT, i_k * BK),
            (BT, BK),
            (1, 0),
        )
        b_k = tl.load(p_k, boundary_check=(0, 1))
        # Scale each key row by its beta and exp(g) before the matmul.
        b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype)
        b_w = tl.dot(b_A, b_kb)
        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))


def recompute_w_u_fwd(
    k: torch.Tensor,
    v: torch.Tensor,
    beta: torch.Tensor,
    g_cumsum: torch.Tensor,
    A: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
) -> tuple[torch.Tensor, torch.Tensor]:
    """Launch the two recompute kernels and return `(w, u)`.

    Args:
        k: 4-D keys, unpacked as (B, T, Hg, K).
        v: values; H is read from `v.shape[-2]` and V from `v.shape[-1]`.
        beta: per-token scale passed through to both kernels.
        g_cumsum: passed to both kernels as `g`.
        A: chunk matrix; the chunk size BT is read from `A.shape[-1]`.
        cu_seqlens: cumulative sequence offsets for variable-length
            batches, or None for fixed-length batches.

    Returns:
        `w` with shape (B, T, H, K), allocated with k's dtype and device,
        and `u` with the shape and dtype of `v`.
    """
    B, T, Hg, K, V = *k.shape, v.shape[-1]
    H = v.shape[-2]
    BT = A.shape[-1]

    chunk_indices = prepare_chunk_indices(
        cu_seqlens, BT) if cu_seqlens is not None else None
    # One program per chunk: taken from chunk_indices for varlen input.
    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
    BK = 64
    BV = 64
    u = torch.empty_like(v)
    w = k.new_empty(B, T, H, K)
    # Both kernels take the same argument list; the first fills `u`, the
    # second fills `w`.
    recompute_u_fwd_kernel[(NT, B * H)](
        k=k,
        v=v,
        beta=beta,
        w=w,
        u=u,
        A=A,
        g=g_cumsum,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        Hg=Hg,
        K=K,
        V=V,
        BT=BT,
        BK=BK,
        BV=BV,
    )
    recompute_w_fwd_kernel[(NT, B * H)](
        k=k,
        v=v,
        beta=beta,
        w=w,
        u=u,
        A=A,
        g=g_cumsum,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        Hg=Hg,
        K=K,
        V=V,
        BT=BT,
        BK=BK,
        BV=BV,
    )
    return w, u
-# Author: Dong Xinyu, Chen Zhennan, Bao Qian, Yuan Jizhong -# Email: dongxinyu03@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """layer.py""" import torch +import os from typing import Callable, Optional import vllm.envs as envs from vllm.config import get_current_vllm_config from vllm.forward_context import ForwardContext, get_forward_context from vllm.distributed import get_ep_group +from vllm.distributed.eplb.eplb_state import EplbState from vllm.model_executor.layers.fused_moe import FusedMoE as VllmFusedMoE from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase as VllmFusedMoEMethodBase @@ -101,7 +85,6 @@ class UnquantizedFusedMoEMethod(VllmUnquantizedFusedMoEMethod): ) -> torch.Tensor: """forward_kunlun""" from vllm_kunlun.ops._kunlun_ops import KunlunOps as ops - if self.moe.use_ep: return ops.fused_moe_ep(x, layer.w13_weight, @@ -116,6 +99,96 @@ class UnquantizedFusedMoEMethod(VllmUnquantizedFusedMoEMethod): num_expert_group=num_expert_group, topk_group=topk_group ) + # fused_moe do not support expert number > 400 + elif layer.local_num_experts > 400: + hidden_states = x + global_num_experts = linear_weights.shape[0] + M, N = hidden_states.shape + hidden_dim = layer.w2_weight.shape[1] + normed_score = torch.empty(M, + top_k, + dtype=torch.float32, + device=hidden_states.device) + topk_ids = torch.empty(M, + top_k, + dtype=torch.int32, + device=hidden_states.device) + num_blocks = 12 + 
block_statistic = torch.zeros( + num_blocks, global_num_experts, dtype=torch.int32, device=hidden_states.device + ) + + router_logits = router_logits.float() + torch.ops._C.moe_softmax_topk_norm( + x=router_logits, + normed_score=normed_score, + topk_index=topk_ids, + block_statistic=None, + stable=True) + + moe_expand = torch.empty((M * top_k, N), dtype=hidden_states.dtype, device=hidden_states.device) # [M, top_k, N], float + expert_m = torch.zeros(global_num_experts, dtype=torch.int32, device=hidden_states.device) # [E] + sorted_tokens_num_lod = torch.zeros(global_num_experts + 1, dtype=torch.int32, device=hidden_states.device) # [E+1] + sorted_tokens_idx = torch.zeros(M * top_k, dtype=torch.int32, device=hidden_states.device) + + torch.ops._C.gen_block_statistic(topk_ids,block_statistic) + + torch.ops._C.moe_pre_sorted( + x=hidden_states, + topk_index=topk_ids, + block_statistic=block_statistic, + moe_expand=moe_expand, + moe_index=sorted_tokens_idx, + expert_m=expert_m, + sorted_tokens_num_lod=sorted_tokens_num_lod) + + y = torch.empty(M,top_k, + layer.w13_weight.shape[1], + dtype=hidden_states.dtype, + device=hidden_states.device) + + moe_expand = moe_expand.view(M * top_k, hidden_dim) + + torch.ops._C.moe_fc( + x=moe_expand, + weight=layer.w13_weight, + sorted_tokens_num_lod=sorted_tokens_num_lod, + sorted_tokens_idx=sorted_tokens_idx, + moe_topk=top_k, + y=y) + + d = y.shape[-1] // 2 + output_shape = (y.shape[:-1] + (d, )) + out1 = torch.empty(output_shape, dtype=y.dtype, device=y.device) + torch.ops._C.swiglu(y, out1) + + out = torch.empty(M,top_k, + layer.w2_weight.shape[1], + dtype=hidden_states.dtype, + device=hidden_states.device) + + out1 = out1.reshape(-1, out1.shape[-1]) + + torch.ops._C.moe_fc( + x=out1, + weight=layer.w2_weight, + sorted_tokens_num_lod=sorted_tokens_num_lod, + sorted_tokens_idx=sorted_tokens_idx, + moe_topk=top_k, + y=out) + + dequant_scale = torch.ones([M, top_k], dtype = torch.float32, device=out.device) + output = 
torch.empty([M, N], dtype=hidden_states.dtype, device=hidden_states.device) + sorted_tokens_idx = sorted_tokens_idx.view(M, top_k) + + torch.ops._C.moe_post( + x=out, + moe_index=sorted_tokens_idx, + normed_scale=normed_score, + dequant_scale=dequant_scale, + y=output + ) + return output else: return ops.fused_moe(x, layer.w13_weight, @@ -155,6 +228,7 @@ class FusedMoE(VllmFusedMoE): activation: str = "silu", enable_eplb: bool = False, num_redundant_experts: int = 0, + is_sequence_parallel=False, ): super().__init__( num_experts=num_experts, # Global number of experts @@ -189,7 +263,7 @@ class FusedMoE(VllmFusedMoE): # since model_config is not set in the pytest test. model_dtype = params_dtype - moe = FusedMoEConfig.make( + moe = FusedMoEConfig( num_experts=self.global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, @@ -197,7 +271,7 @@ class FusedMoE(VllmFusedMoE): moe_parallel_config=self.moe_parallel_config, in_dtype=model_dtype, max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, - quant_config=quant_config, + # quant_config=quant_config, ) self.moe_config = moe self.quant_config = quant_config @@ -307,4 +381,35 @@ class FusedMoE(VllmFusedMoE): final_hidden_states = self.maybe_all_reduce_tensor_model_parallel( final_hidden_states) - return final_hidden_states \ No newline at end of file + return final_hidden_states + @classmethod + def make_expert_params_mapping( + cls, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0) -> list[tuple[str, str, int, str]]: + + num_physical_experts = num_experts + num_redundant_experts + + # In the returned mapping: + # - `expert_id` is the physical expert id + # - `weight_name` contains the weight name of the logical expert + # So that we should map the expert id to logical in `weight_name` + physical_to_logical_map = \ + EplbState.build_initial_global_physical_to_logical_map( + num_experts, num_redundant_experts) + + return [ + # 
(param_name, weight_name, expert_id, shard_id) + ("experts.w13_" if weight_name + in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_", + f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.", + expert_id, shard_id) for expert_id in range(num_physical_experts) + for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] diff --git a/vllm_kunlun/ops/layernorm.py b/vllm_kunlun/ops/layernorm.py index e332067..badaba1 100644 --- a/vllm_kunlun/ops/layernorm.py +++ b/vllm_kunlun/ops/layernorm.py @@ -12,49 +12,101 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# This file is a part of the vllm-kunlun project. +# This file is a part of the vllm-ascend project. # import torch from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm import GemmaRMSNorm as OriGemmaRMSNorm +from vllm.model_executor.layers import layernorm from typing import Optional, Union import xtorch_ops - def vllm_kunlun_forward_cuda( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, -) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - """forward_cuda""" - if x.is_contiguous() == False: - # kunlun does not support uncontiguous input and they do not think it is a bug - # so we must make it contiguous() manually - x = x.contiguous() - if self.variance_size_override is not None: - return self.forward_native(x, residual) + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + """forward_cuda""" + if x.is_contiguous() == False: + # kunlun does not support uncontiguous input and they do not think it is a bug + # so we must make it contiguous() manually + x = x.contiguous() + if self.variance_size_override is not None: + return 
self.forward_native(x, residual) - if residual is not None: - # residual_output = torch.empty_like(residual) - torch.ops._C.add_rmsnorm( + + if residual is not None: + # residual_output = torch.empty_like(residual) + torch.ops._C.add_rmsnorm( + x, + residual, + residual_output=residual, + weight=self.weight.data, + eps=self.variance_epsilon, + output=x + ) + return x, residual + out = torch.empty_like(x) + torch.ops._C.rmsnorm( x, - residual, - residual_output=residual, - weight=self.weight.data, - eps=self.variance_epsilon, - output=x, + self.weight.data, + out, + self.variance_epsilon, ) - return x, residual - out = torch.empty_like(x) - torch.ops._C.rmsnorm( - x, - self.weight.data, - out, - self.variance_epsilon, - ) - return out + return out + + +class KunlunGemmaRMSNorm(OriGemmaRMSNorm): + @staticmethod + def forward_xpu( + weight: torch.Tensor, + variance_epsilon: float, + x: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + if x.is_contiguous() == False: + # kunlun does not support uncontiguous input and they do not think it is a bug + # so we must make it contiguous() manually + x = x.contiguous() + + if residual is not None: + torch.ops._C.add_rmsnorm( + x, + residual, + residual_output=residual, + weight=weight+1, + eps=variance_epsilon, + output=x + ) + return x, residual + + out = torch.empty_like(x) + torch.ops._C.rmsnorm( + x, + weight+1, + out, + variance_epsilon, + ) + return out + + def forward_cuda( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + if torch.compiler.is_compiling(): + self.forward_static = self.forward_xpu # only use in cudagraph + return self.forward_native(x, residual) + + if not getattr(self, "_is_compiled", False): + self.forward_static = torch.compile( # type: ignore + self.forward_static, backend="aot_eager") + self._is_compiled = True + return self.forward_native(x, residual) 
RMSNorm.forward_cuda = vllm_kunlun_forward_cuda RMSNorm.forward = vllm_kunlun_forward_cuda +layernorm.GemmaRMSNorm = KunlunGemmaRMSNorm \ No newline at end of file diff --git a/vllm_kunlun/ops/linear.py b/vllm_kunlun/ops/linear.py index 3dd3296..cd6c1ba 100644 --- a/vllm_kunlun/ops/linear.py +++ b/vllm_kunlun/ops/linear.py @@ -1,13 +1,331 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools +from abc import abstractmethod +from typing import Any, Literal, Optional, Union + import torch import torch.nn as nn -from torch.nn.parameter import Parameter +from torch.nn.parameter import Parameter, UninitializedParameter -from vllm.model_executor.layers.linear import ReplicatedLinear as VllmReplicatedLinear +from vllm.distributed import (divide, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce) +from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.utils import dispatch_unquantized_gemm +# yapf: disable +from vllm.model_executor.parameter import (BasevLLMParameter, + BlockQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + PerTensorScaleParameter, + RowvLLMParameter) +# yapf: enable +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform -class ReplicatedLinear(VllmReplicatedLinear): - """Replicated linear layer""" +logger = init_logger(__name__) + +WEIGHT_LOADER_V2_SUPPORTED = [ + "CompressedTensorsLinearMethod", + "CompressedTensorsLinearTransformMethod", + "BitBLASLinearMethod", + "GPTQBitBLASLinearMethod", + "AWQMarlinLinearMethod", + "AWQLinearMethod", + "GPTQMarlinLinearMethod", + "Fp8LinearMethod", + "MarlinLinearMethod", + 
"GPTQMarlin24LinearMethod", + "TPUInt8LinearMethod", + "GPTQLinearMethod", + "FBGEMMFp8LinearMethod", + "ModelOptFp8LinearMethod", + "IPEXAWQLinearMethod", + "IPEXGPTQLinearMethod", + "HQQMarlinMethod", + "QuarkLinearMethod", + "ModelOptNvFp4LinearMethod", + "PetitNvFp4LinearMethod", +] + + +def adjust_bitblas_shard(param, shard_size, shard_offset): + bitblas_tile_size = getattr(param, "bitblas_tile_size", None) + if bitblas_tile_size is not None: + return (shard_size // bitblas_tile_size, + shard_offset // bitblas_tile_size) + + return shard_size, shard_offset + + +def adjust_marlin_shard(param, shard_size, shard_offset): + marlin_tile_size = getattr(param, "marlin_tile_size", None) + if marlin_tile_size is None: + return shard_size, shard_offset + + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + +def adjust_bitsandbytes_4bit_shard(param: Parameter, + shard_offsets: dict[str, tuple[int, int]], + loaded_shard_id: str) -> tuple[int, int]: + """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" + + total, _ = shard_offsets["total"] + orig_offset, orig_size = shard_offsets[loaded_shard_id] + + quantized_total = param.data.shape[0] + quantized_offset = orig_offset * quantized_total // total + quantized_size = orig_size * quantized_total // total + + return quantized_size, quantized_offset + + +def adjust_scalar_to_fused_array(param, loaded_weight, shard_id): + """For fused modules (QKV and MLP) we have an array of length + N that holds 1 scale for each "logical" matrix. So the param + is an array of length N. The loaded_weight corresponds to + one of the shards on disk. Here, we slice the param based on + the shard_id for loading. 
+ """ + qkv_idxs = {"q": 0, "k": 1, "v": 2} + + if isinstance(shard_id, str): + shard_id = qkv_idxs[shard_id] + elif not isinstance(shard_id, int): + raise ValueError(f"Unknown Shard Id {shard_id}") + + # AutoFP8 scales do not have a shape + # compressed-tensors scales do have a shape + if len(loaded_weight.shape) != 0: + assert loaded_weight.shape[0] == 1 + loaded_weight = loaded_weight[0] + + return param[shard_id], loaded_weight + + +# TODO(Isotr0py): We might need a more flexible structure to handle +# bitsandbytes shard offsets. +def left_shift_bitsandbytes_4bit_shard(bnb_weight_attrs: dict[str, Any]): + """ + Separate the BitsAndBytes 4-bit shard. + + For example, given bnb weight attributes as below: + { + 'bnb_shard_offsets': array([0, 4, 8, 16]), + 'bnb_quant_state': {0: ..., 1: ..., 2: ...}, + } + + The function will return: + { + 'bnb_shard_offsets': array([0, 4]), + 'bnb_quant_state': {0: ...}, + } + and + { + 'bnb_shard_offsets': array([0, 4, 12]), + 'bnb_quant_state': {0: ..., 1: ...}, + } + """ + shard_offsets = bnb_weight_attrs["bnb_shard_offsets"] + offset_l = shard_offsets[:2] + offset_r = shard_offsets[1:] - shard_offsets[1] + quant_state_l = {0: bnb_weight_attrs["bnb_quant_state"][0]} + quant_state_r = { + i - 1: bnb_weight_attrs["bnb_quant_state"][i] + for i in range(1, + len(shard_offsets) - 1) + } + left = dict(bnb_shard_offsets=offset_l, bnb_quant_state=quant_state_l) + right = dict(bnb_shard_offsets=offset_r, bnb_quant_state=quant_state_r) + return left, right + + +class LinearMethodBase(QuantizeMethodBase): + """Base class for different (maybe quantized) linear methods.""" + + @abstractmethod + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + """Create weights for a linear layer. + The weights will be set as attributes of the layer. 
+ + Args: + layer: The layer that is using the LinearMethodBase factory. + input_size_per_partition: Size of the weight input dim on rank X. + output_partition_sizes: Sizes of the output dim of each logical + weight on rank X. E.g., output_partition_sizes for QKVLinear + is a list contains the width of Wq, Wk, Wv on rank X. + input_size: Size of the input dim of the weight across all ranks. + output_size: Size of the output dim of the weight across all ranks. + params_dtype: Datatype of the parameters. + """ + raise NotImplementedError + + @abstractmethod + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """Apply the weights in layer to the input tensor. + Expects create_weights to have been called before on the layer.""" + raise NotImplementedError + + +class UnquantizedLinearMethod(LinearMethodBase): + """Linear method without quantization.""" + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if current_platform.is_cpu(): + from vllm.model_executor.layers.utils import ( + dispatch_cpu_unquantized_gemm) + dispatch_cpu_unquantized_gemm(layer, remove_weight=True) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) + + +class LinearBase(CustomOp): + """Base linear layer. + + Args: + input_size: input dimension of the linear layer. 
+ output_size: output dimension of the linear layer. + skip_bias_add: If true, skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: Prefix for parameter names. + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, tensor parallelism will be disabled for this layer. + """ + + def __init__( + self, + input_size: int, + output_size: int, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + super().__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.skip_bias_add = skip_bias_add + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + self.quant_config = quant_config + self.prefix = prefix + if quant_config is None: + self.quant_method: Optional[ + QuantizeMethodBase] = UnquantizedLinearMethod() + else: + self.quant_method = quant_config.get_quant_method(self, + prefix=prefix) + self.return_bias = return_bias + self.disable_tp = disable_tp + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + + def update_param_tp_status(self): + for param in self.parameters(): + if isinstance(param, BasevLLMParameter): + param.tp_rank = self.tp_rank + param.tp_size = self.tp_size + + +@CustomOp.register("replicated_linear_kulnun") +class ReplicatedLinear(LinearBase): + """Replicated linear layer. + + Args: + input_size: input dimension of the linear layer. + output_size: output dimension of the linear layer. + bias: If true, add bias. + skip_bias_add: If true, skip adding bias but instead return it. + params_dtype: Data type for the parameters. 
+ quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: Take no effect for replicated linear layers. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + super().__init__(input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix=prefix, + return_bias=return_bias, + disable_tp=disable_tp) + + # All the linear layer supports quant method. + assert self.quant_method is not None + self.quant_method.create_weights(self, + self.input_size, [self.output_size], + self.input_size, + self.output_size, + self.params_dtype, + weight_loader=self.weight_loader) + + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=self.params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) def get_weights(self): """get_weights""" @@ -21,4 +339,1246 @@ class ReplicatedLinear(VllmReplicatedLinear): """get_weights_half""" if hasattr(self, 'kunlun_linear_weights_half'): return self.kunlun_linear_weights_half - weights = torch.nn.Parameter(self.weight.to(torch.float16)) \ No newline at end of file + weights = torch.nn.Parameter(self.weight.to(torch.float16)) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + # If the weight on disk does not have a shape, give it one + # (such scales for AutoFp8). 
+ # Special case for GGUF + + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter of size {param.size()}") + param.data.copy_(loaded_weight) + + def forward( + self, x: torch.Tensor + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + bias = self.bias if not self.skip_bias_add else None + assert self.quant_method is not None + + output = self.quant_method.apply(self, x, bias) + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + def extra_repr(self) -> str: + s = f"in_features={self.input_size}" + s += f", output_features={self.output_size}" + s += f", bias={self.bias is not None}" + return s + + +@CustomOp.register("column_parallel_linear_kunlun") +class ColumnParallelLinear(LinearBase): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Args: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. 
+ quant_config: Quantization configure. + output_sizes: list of output sizes packed into one output, like for QKV + the list would be size 3. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[list[int]] = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + # Divide the weight matrix along the last dimension. + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + self.input_size_per_partition = input_size + self.output_size_per_partition = divide(output_size, self.tp_size) + self.output_partition_sizes = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. 
+ if hasattr(self, "output_sizes"): + self.output_partition_sizes = [ + divide(output_size, self.tp_size) + for output_size in self.output_sizes + ] + + super().__init__(input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias, + disable_tp=disable_tp) + + self.gather_output = gather_output + + if output_sizes is None: + output_sizes = [output_size] + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) + if bias: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, + dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + self.update_param_tp_status() + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + + output_dim = getattr(param, "output_dim", None) + + is_sharded_weight = getattr(param, "is_sharded_weight", False) + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + # Special case for GGUF + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + final_shape = list(loaded_weight.shape) + if output_dim is not None: + assert final_shape[output_dim] % self.tp_size == 
0 + final_shape[output_dim] = (final_shape[output_dim] // + self.tp_size) + param.materialize(final_shape, dtype=loaded_weight.dtype) + + param_data = param.data + if output_dim is not None and not is_sharded_weight: + shard_size = param_data.shape[output_dim] + start_idx = self.tp_rank * shard_size + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def weight_loader_v2(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + assert loaded_weight.numel() == 1 + loaded_weight = loaded_weight.reshape(1) + param.load_column_parallel_weight(loaded_weight=loaded_weight) + + def forward( + self, input_ + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + bias = self.bias if not self.skip_bias_add else None + + # Matrix multiply. + assert self.quant_method is not None + output_parallel = self.quant_method.apply(self, input_, bias) + + if self.gather_output and self.tp_size > 1: + # All-gather across the partitions. 
+ output = tensor_model_parallel_all_gather(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + if not self.return_bias: + return output + return output, output_bias + + def extra_repr(self) -> str: + s = f"in_features={self.input_size}" + s += f", output_features={self.output_size_per_partition}" + s += f", bias={self.bias is not None}" + s += f", tp_size={self.tp_size}" + s += f", gather_output={self.gather_output}" + return s + + +class MergedColumnParallelLinear(ColumnParallelLinear): + """Packed linear layers with column parallelism. + + Similar to ColumnParallelLinear, but the weight matrix is concatenated + along the output dimension. When the weight matrix is loaded, the + different partitions are sharded separately. + + Args: + input_size: input dimension of the linear layer. + output_sizes: list of output dimensions of the linear layer. + bias: If true, add bias. + gather_output: If true, call all-gather on output and make the output + available to all GPUs, otherwise, every GPU will have + its own output. + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, all weights matrix won't be sharded, this layer + will be treated as a "Replicated" MergedLinear. 
+ """ + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + self.output_sizes = output_sizes + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + + assert all(output_size % self.tp_size == 0 + for output_size in output_sizes) + super().__init__(input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix, + return_bias=return_bias, + disable_tp=disable_tp) + + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + + # Special case for GGUF + # initialize GGUF param after we know the quantize type + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + if loaded_shard_id is not None: + param.data[loaded_shard_id].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + else: + param.shard_weight_type = { + i: loaded_weight.item() + for i, _ in enumerate(self.output_sizes) + } + return + + if is_gguf_weight: + + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size + + if loaded_shard_id is not None: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + return + + 
param_data = param.data + output_dim = getattr(param, "output_dim", None) + # Special case for per-tensor scale to load scalar into fused array. + needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) + + if loaded_shard_id is None: + # Loaded weight is already fused on disk (mlp). + # (e.g., Phi-3's gate_up_proj). + if output_dim is None: + if needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, 0) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + current_shard_offset = 0 + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", + False) + shard_offsets: list[tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + packed_dim = getattr(param, "packed_dim", None) + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + # Special case for Marlin. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + shard_size, shard_offset = adjust_bitblas_shard( + param, shard_size, shard_offset) + + if use_bitsandbytes_4bit: + index = list(itertools.accumulate([0] + self.output_sizes)) + orig_offsets = { + str(i): (index[i], size) + for i, size in enumerate(self.output_sizes) + } + orig_offsets["total"] = (self.output_size, 0) + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_offsets, str(shard_id)) + + loaded_weight_shard = loaded_weight.narrow( + output_dim, shard_offset, shard_size) + self.weight_loader(param, loaded_weight_shard, shard_id) + return + + assert loaded_shard_id < len(self.output_sizes) + if output_dim is not None: + shard_offset = (sum(self.output_sizes[:loaded_shard_id]) // + self.tp_size) + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size + # Special case for quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + packed_dim = getattr(param, "packed_dim", None) + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + # Special case for Marlin. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + shard_size, shard_offset = adjust_bitblas_shard( + param, shard_size, shard_offset) + + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", + False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + if use_bitsandbytes_4bit: + shard_size = loaded_weight.shape[output_dim] + shard_offset = loaded_weight.shape[output_dim] * \ + loaded_shard_id + + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + start_idx = self.tp_rank * shard_size + if not is_sharded_weight: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + # Special case for per-tensor scales in fused case. + elif needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, loaded_shard_id) + + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "MergedColumnParallelLinear, assume the weight is " + "the same for all partitions.") + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + """ + Handle special case for models where MLP layers are already + fused on disk. In this case, we have no shard id. This function + determines the shard id by splitting these layers and then calls + the weight loader using the shard id. 
+ + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + + current_shard_offset = 0 + shard_offsets: list[tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if isinstance(param, (PackedColumnParameter, PackedvLLMParameter + )) and param.packed_dim == param.output_dim: + shard_size, shard_offset = \ + param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset) + + loaded_weight_shard = loaded_weight.narrow(param.output_dim, + shard_offset, + shard_size) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + if loaded_shard_id is None: + if isinstance(param, PerTensorScaleParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=0) + return + elif type(param) in (RowvLLMParameter, BasevLLMParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight) + return + # TODO: @dsikka - move to parameter.py + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id < len(self.output_sizes) + + if isinstance(param, BlockQuantScaleParameter): + from vllm.model_executor.layers.quantization.fp8 import ( + Fp8LinearMethod, Fp8MoEMethod) + assert self.quant_method is not None + assert isinstance(self.quant_method, + (Fp8LinearMethod, Fp8MoEMethod)) + weight_block_size = self.quant_method.quant_config.weight_block_size + assert weight_block_size is not None + block_n, _ = weight_block_size[0], weight_block_size[1] + shard_offset = ( + 
(sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // + block_n) // self.tp_size + shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) // + block_n // self.tp_size) + else: + shard_offset = sum( + self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size + + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank) + + +class QKVParallelLinear(ColumnParallelLinear): + """Linear layers for the attention's QKV transformation. + + Linear layers for the linear transformation of the query, key, and value + vectors in the attention layer. The weight matrix is concatenated along + the output dimension. The layer is parallelized along the head dimension. + When the number of key/value heads is smaller than the number of query + heads (e.g., multi-query/grouped-query attention), the key/value head may + be replicated while the query heads are partitioned. + + Args: + hidden_size: input hidden state size of the transformer. + head_size: size of each attention head. + total_num_heads: total number of attention query heads. + total_num_kv_heads: total number of attention key/value heads. If + None, assume total_num_kv_heads = total_num_heads. + bias: If true, add bias. + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. 
+ """ + + def __init__( + self, + hidden_size: int, + head_size: int, + total_num_heads: int, + total_num_kv_heads: Optional[int] = None, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + self.hidden_size = hidden_size + self.head_size = head_size + self.total_num_heads = total_num_heads + if total_num_kv_heads is None: + total_num_kv_heads = total_num_heads + self.total_num_kv_heads = total_num_kv_heads + # Divide the weight matrix along the last dimension. + tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + self.num_heads = divide(self.total_num_heads, tp_size) + if tp_size >= self.total_num_kv_heads: + self.num_kv_heads = 1 + self.num_kv_head_replicas = divide(tp_size, + self.total_num_kv_heads) + else: + self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) + self.num_kv_head_replicas = 1 + input_size = self.hidden_size + output_size = (self.num_heads + + 2 * self.num_kv_heads) * tp_size * self.head_size + self.output_sizes = [ + self.num_heads * self.head_size * tp_size, # q_proj + self.num_kv_heads * self.head_size * tp_size, # k_proj + self.num_kv_heads * self.head_size * tp_size, # v_proj + ] + + super().__init__(input_size=input_size, + output_size=output_size, + bias=bias, + gather_output=False, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix, + return_bias=return_bias, + disable_tp=disable_tp) + + def _get_shard_offset_mapping(self, loaded_shard_id: str): + shard_offset_mapping = { + "q": 0, + "k": self.num_heads * self.head_size, + "v": (self.num_heads + self.num_kv_heads) * self.head_size, + "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size + } + return shard_offset_mapping.get(loaded_shard_id) + + def _get_shard_size_mapping(self, loaded_shard_id: str): + 
shard_size_mapping = { + "q": self.num_heads * self.head_size, + "k": self.num_kv_heads * self.head_size, + "v": self.num_kv_heads * self.head_size, + } + return shard_size_mapping.get(loaded_shard_id) + + def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + """ + Handle special case for models where QKV layers are already + fused on disk. In this case, we have no shard id. This function + determines the shard id by splitting these layers and then calls + the weight loader using the shard id. + + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + shard_offsets = [ + # (shard_id, shard_offset, shard_size) + ("q", 0, self.total_num_heads * self.head_size), + ("k", self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + ("v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size), + ] + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. 
+ if isinstance(param, (PackedColumnParameter, PackedvLLMParameter + )) and param.packed_dim == param.output_dim: + shard_size, shard_offset = \ + param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset) + + loaded_weight_shard = loaded_weight.narrow(param.output_dim, + shard_offset, + shard_size) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + if loaded_shard_id is None: # special case for certain models + if isinstance(param, PerTensorScaleParameter): + param.load_qkv_weight(loaded_weight=loaded_weight, + shard_id=0, + tp_rank=self.tp_rank) + return + elif type(param) in (RowvLLMParameter, BasevLLMParameter): + param.load_qkv_weight(loaded_weight=loaded_weight, + tp_rank=self.tp_rank) + return + # TODO: @dsikka - move to parameter.py + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id in ["q", "k", "v"] + + shard_offset = self._get_shard_offset_mapping(loaded_shard_id) + shard_size = self._get_shard_size_mapping(loaded_shard_id) + + # Note(simon): This is needed for Qwen3's fp8 quantization. 
+ if isinstance(param, BlockQuantScaleParameter): + assert self.quant_method is not None + assert hasattr(self.quant_method, "quant_config") + weight_block_size = self.quant_method.quant_config.weight_block_size + block_n, _ = weight_block_size[0], weight_block_size[1] + shard_offset = (shard_offset + block_n - 1) // block_n + shard_size = (shard_size + block_n - 1) // block_n + + param.load_qkv_weight(loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank) + + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + + # Special case for GGUF + # initialize GGUF param after we know the quantize type + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + idx_map = {"q": 0, "k": 1, "v": 2} + if loaded_shard_id is not None: + param.data[idx_map[loaded_shard_id]].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + else: + param.shard_weight_type = { + k: loaded_weight.item() + for k in idx_map + } + return + + if is_gguf_weight: + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size + + if loaded_shard_id is not None: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + return + + param_data = param.data + output_dim = getattr(param, "output_dim", None) + + # Special case for per-tensor scales in fused case. + needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) + + if loaded_shard_id is None: + # Loaded weight is already fused on disk (qkv). 
+ # (e.g., Phi-3's qkv_proj). + if output_dim is None: + if needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, 0) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + shard_offsets = [ + # (shard_id, shard_offset, shard_size) + ("q", 0, self.total_num_heads * self.head_size), + ("k", self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + ("v", (self.total_num_heads + self.total_num_kv_heads) * + self.head_size, self.total_num_kv_heads * self.head_size), + ] + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", + False) + + packed_dim = getattr(param, "packed_dim", None) + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantized Weights. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + + # Special case for Marlin. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + if use_bitsandbytes_4bit: + orig_qkv_offsets = { + "q": (0, self.total_num_heads * self.head_size), + "k": (self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + "v": + ((self.total_num_heads + self.total_num_kv_heads) * + self.head_size, + self.total_num_kv_heads * self.head_size), + "total": + ((self.total_num_heads + 2 * self.total_num_kv_heads) * + self.head_size, 0) + } + + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_qkv_offsets, shard_id) + + loaded_weight_shard = loaded_weight.narrow( + output_dim, shard_offset, shard_size) + self.weight_loader(param, loaded_weight_shard, shard_id) + return + + assert loaded_shard_id in ["q", "k", "v"] + + # If output dim is defined, use the default loading process. 
+ if output_dim is not None: + if loaded_shard_id == "q": + shard_offset = 0 + shard_size = self.num_heads * self.head_size + elif loaded_shard_id == "k": + shard_offset = self.num_heads * self.head_size + shard_size = self.num_kv_heads * self.head_size + elif loaded_shard_id == "v": + shard_offset = (self.num_heads + + self.num_kv_heads) * self.head_size + shard_size = self.num_kv_heads * self.head_size + # Special case for Quantized Weights. + # If quantized, we need to adjust the offset and size to account + # for the packing. + packed_dim = getattr(param, "packed_dim", None) + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + + # Special case for Marlin. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", + False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + if use_bitsandbytes_4bit: + orig_qkv_offsets = { + "q": (0, self.num_heads * self.head_size), + "k": (self.num_heads * self.head_size, + self.num_kv_heads * self.head_size), + "v": + ((self.num_heads + self.num_kv_heads) * self.head_size, + self.num_kv_heads * self.head_size), + "total": + ((self.num_heads + 2 * self.num_kv_heads) * self.head_size, + 0) + } + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_qkv_offsets, loaded_shard_id) + + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + if loaded_shard_id == "q": + shard_id = self.tp_rank + else: + shard_id = self.tp_rank // self.num_kv_head_replicas + start_idx = shard_id * shard_size + + if not is_sharded_weight: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + + # Special case for per-tensor scales in fused case. 
+ elif needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, loaded_shard_id) + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "QKVParallelLinear, assume the weight is the same " + "for all partitions.") + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +@CustomOp.register("row_parallel_linear_kunlun") +class RowParallelLinear(LinearBase): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + skip_bias_add: This was added to enable performance optimization where + bias can be fused with other element-wise operations. + We skip adding bias but instead return it. + params_dtype: Data type for the parameters. + reduce_results: If true, call all-reduce on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y = X_iA_i + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.down_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. 
+ """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + input_is_parallel: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + # Divide the weight matrix along the first dimension. + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + + super().__init__(input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias, + disable_tp=disable_tp) + + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) + if not reduce_results and (bias and not skip_bias_add): + raise ValueError("When not reduce the results, adding bias to the " + "results can lead to incorrect results") + + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + self.update_param_tp_status() + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + input_dim = getattr(param, "input_dim", None) + 
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + # Special case for GGUF + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + weight_shape = list(loaded_weight.shape) + if input_dim: + weight_shape[input_dim] = (weight_shape[input_dim] // + self.tp_size) + param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) + + param_data = param.data + if input_dim is not None and not is_sharded_weight: + shard_size = param_data.shape[input_dim] + start_idx = self.tp_rank * shard_size + loaded_weight = loaded_weight.narrow(input_dim, start_idx, + shard_size) + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def weight_loader_v2(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). 
+ if len(loaded_weight.shape) == 0: + assert loaded_weight.numel() == 1 + loaded_weight = loaded_weight.reshape(1) + + param.load_row_parallel_weight(loaded_weight=loaded_weight) + + def forward( + self, input_ + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + if self.input_is_parallel: + input_parallel = input_ + else: + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[self.tp_rank].contiguous() + + # Matrix multiply. + assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_) + if self.reduce_results and self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output_parallel) + else: + output = output_parallel + + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + def extra_repr(self) -> str: + s = f"in_features={self.input_size_per_partition}" + s += f", output_features={self.output_size}" + s += f", bias={self.bias is not None}" + s += f", tp_size={self.tp_size}" + s += f", reduce_results={self.reduce_results}" + return s + + +@CustomOp.register("qkv_cross_parallel_linear_kunlun") +class QKVCrossParallelLinear(LinearBase): + """Linear layers for efficient cross-attention's QKV transformation. + + Args: + hidden_size: input hidden state size of the transformer. + head_size: size of each attention head. + total_num_heads: total number of attention query heads. + total_num_kv_heads: total number of attention key/value heads. If + None, assume total_num_kv_heads = total_num_heads. + bias: If true, add bias. + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. 
we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + """ + + def __init__(self, + hidden_size: int, + head_size: int, + total_num_heads: int, + total_num_kv_heads: Optional[int] = None, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + # input_size and output_size are not used, just for alignment + input_size = hidden_size + output_size = (total_num_heads + (total_num_kv_heads or 0)) * head_size + super().__init__(input_size=input_size, + output_size=output_size, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix) + + self.quant_config = quant_config + + # Empty placeholders for loading as a single module. + placeholder_size = 0 + assert self.quant_method is not None + self.quant_method.create_weights(self, + placeholder_size, [placeholder_size], + placeholder_size, + placeholder_size, + self.params_dtype, + weight_loader=self.weight_loader) + + # Use a dictionary to avoid submodules parameters auto-registration: + # drop-in replacement for a `QKVParallelLinear` module. 
+ self.proj = dict() + self.proj["q_proj_decoder"] = ColumnParallelLinear( + input_size=hidden_size, + output_size=total_num_heads * head_size, + bias=bias, + quant_config=quant_config, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + prefix=f"{prefix}.q_proj_decoder") + + self.proj["kv_proj_encoder"] = QKVParallelLinear( + hidden_size=hidden_size, + head_size=head_size, + total_num_heads=0, + total_num_kv_heads=total_num_kv_heads, + bias=bias, + quant_config=quant_config, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + prefix=f"{prefix}.kv_proj_encoder") + + # `kv_proj_encoder.num_kv_heads` accounts for sharding with tp>1. + self.q_size = self.q_proj_decoder.output_size_per_partition + self.kv_size = self.kv_proj_encoder.num_kv_heads * head_size + + if bias: + self.bias = torch.nn.Parameter() + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader_v1, + }) + else: + self.bias = None + + def process_weights_after_loading(self): + for layer in self.proj.values(): + if self.quant_method is not None: + self.quant_method.process_weights_after_loading(layer) + + @property + def q_proj_decoder(self) -> ColumnParallelLinear: + layer = self.proj["q_proj_decoder"] + for name, param in self.named_parameters(): + target_param = getattr(layer, name, None) + if target_param is not None: + self.sync_weight_attrs(param, + target_param, + mode="q_proj_decoder") + return layer + + @property + def kv_proj_encoder(self) -> QKVParallelLinear: + layer = self.proj["kv_proj_encoder"] + for name, param in self.named_parameters(): + target_param = getattr(layer, name, None) + if target_param is not None: + self.sync_weight_attrs(param, + target_param, + mode="kv_proj_encoder") + return layer + + def sync_weight_attrs( + self, + src_param: nn.Parameter, + tgt_param: nn.Parameter, + mode: Literal["q_proj_decoder", "kv_proj_encoder"], + ): + missing_attrs_dict = { + k: getattr(src_param, k) + for k in (set(vars(src_param).keys()) - + 
set(vars(tgt_param).keys())) + } + # TODO(Isotr0py): handle bitsandbytes 8bit + use_bitsandbytes_4bit = getattr(src_param, "use_bitsandbytes_4bit", + False) + if (missing_attrs_dict and use_bitsandbytes_4bit): + q_proj_attrs, kv_proj_attrs = left_shift_bitsandbytes_4bit_shard( + missing_attrs_dict) + if mode == "q_proj_decoder": + set_weight_attrs(tgt_param, q_proj_attrs) + elif mode == "kv_proj_encoder": + set_weight_attrs(tgt_param, kv_proj_attrs) + else: + set_weight_attrs(tgt_param, missing_attrs_dict) + + def _is_same_param( + self, + src_param: torch.nn.Parameter, + map_param: torch.nn.Parameter, + ) -> bool: + """Check if two parameters are exactly pointing to same things.""" + # ignore weight_loader because it's always different + key_to_ignore = ["weight_loader", "_weight_loader"] + has_same_type_name = type(src_param) is type(map_param) + src_param_attrs = { + k: v + for k, v in src_param.__dict__.items() if k not in key_to_ignore + } + map_param_attrs = { + k: v + for k, v in map_param.__dict__.items() if k not in key_to_ignore + } + has_same_attrs = src_param_attrs == map_param_attrs + return has_same_type_name and has_same_attrs + + def select_proj_params( + self, + layer: nn.Module, + param: nn.Parameter, + ) -> nn.Parameter: + """ + Given the placeholder param, + return the corresponding param in the proj layers. + """ + target_param_list = [ + v for _, v in layer.named_parameters() + if self._is_same_param(param, v) + ] + assert len(target_param_list) == 1 + target_param = target_param_list[0] + return target_param + + def forward( # type: ignore[override] + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, ...]: + q, _ = self.q_proj_decoder(decoder_hidden_states) + if encoder_hidden_states is None: + # Encoder KV already cached. + k = None + v = None + else: + # Prefill phase, encoder KV cached here. 
+ kv_enc, _ = self.kv_proj_encoder(encoder_hidden_states) + # Split kv in half + k, v = kv_enc.split(self.kv_size, dim=-1) + return q, k, v + + def weight_loader_v1(self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + # just like all other parameters, does not yet + # support loading bias with weight_loader_v2 + layer = (self.q_proj_decoder + if loaded_shard_id == "q" else self.kv_proj_encoder) + target_param = self.select_proj_params(layer, param) + shard_id_args = (loaded_shard_id, ) if loaded_shard_id != "q" else () + layer.weight_loader(target_param, loaded_weight, *shard_id_args) + + def weight_loader(self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + layer = (self.q_proj_decoder + if loaded_shard_id == "q" else self.kv_proj_encoder) + target_param = self.select_proj_params(layer, param) + shard_id_args = (loaded_shard_id, ) if loaded_shard_id != "q" else () + if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED: + layer.weight_loader_v2(target_param, loaded_weight, *shard_id_args) + else: + layer.weight_loader(target_param, loaded_weight, *shard_id_args) + + def extra_repr(self) -> str: + s = f"in_features={self.input_size}" + s += f", q_size={self.q_size}" + s += f", kv_size={self.kv_size}" + s += f", bias={self.bias is not None}" + s += f", tp_size={get_tensor_model_parallel_world_size()}" + s += ", gather_output=False" + return s diff --git a/vllm_kunlun/lora/punica_wrapper/__init__.py b/vllm_kunlun/ops/mamba/__init__.py similarity index 100% rename from vllm_kunlun/lora/punica_wrapper/__init__.py rename to vllm_kunlun/ops/mamba/__init__.py diff --git a/vllm_kunlun/ops/mamba/causal_conv1d.py b/vllm_kunlun/ops/mamba/causal_conv1d.py new file mode 100644 index 0000000..1e48e36 --- /dev/null +++ b/vllm_kunlun/ops/mamba/causal_conv1d.py @@ -0,0 +1,1217 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +# Copyright (c) 2024, Tri Dao. +# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py + +from typing import Optional, Union + +import numpy as np +import torch + +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.triton_utils import tl, triton + + +@triton.jit() +def _causal_conv1d_fwd_kernel( # continuous batching + # Pointers to matrices + x_ptr, # (dim, cu_seqlen) holding `batch` of actual sequences + padded sequences + w_ptr, # (dim, width) + bias_ptr, + initial_states_ptr, # conv_states_ptr + cache_indices_ptr, # conv_state_indices_ptr + has_initial_states_ptr, + query_start_loc_ptr, + batch_ptr, + token_chunk_offset_ptr, + o_ptr, # (dim, seqlen) - actually pointing to x_ptr + # Matrix dimensions + batch: tl.int32, # actually padded_batch + dim: tl.constexpr, + seqlen: tl.int32, # cu_seqlen + num_cache_lines: tl.constexpr, # added to support vLLM larger cache lines + # Strides + stride_x_seq: tl.constexpr, # stride to get to next sequence, + stride_x_dim: tl.constexpr, # stride to get to next feature-value, + stride_x_token: tl. 
+ constexpr, # stride to get to next token (same feature-index, same sequence-index) + stride_w_dim: tl.constexpr, # stride to get to next dim-axis value + stride_w_width: tl.constexpr, # stride to get to next width-axis value + stride_istate_seq: tl.constexpr, + stride_istate_dim: tl.constexpr, + stride_istate_token: tl.constexpr, + stride_o_seq: tl.constexpr, + stride_o_dim: tl.constexpr, + stride_o_token: tl.constexpr, + # others + pad_slot_id: tl.constexpr, + # Meta-parameters + HAS_BIAS: tl.constexpr, + KERNEL_WIDTH: tl.constexpr, + SILU_ACTIVATION: tl.constexpr, + HAS_INITIAL_STATES: tl.constexpr, + HAS_CACHE: tl.constexpr, + IS_CONTINUOUS_BATCHING: tl.constexpr, + USE_PAD_SLOT: tl.constexpr, + NP2_STATELEN: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + +): + conv_states_ptr = initial_states_ptr + conv_state_indices_ptr = cache_indices_ptr + stride_conv_state_seq = stride_istate_seq + stride_conv_state_dim = stride_istate_dim + stride_conv_state_tok = stride_istate_token + state_len = KERNEL_WIDTH - 1 # can be passed via argument if it's not the same as this value + + # one program handles one chunk in a single sequence + # rather than mixing sequences - to make updating initial_states across sequences efficiently + + # single-sequence id + idx_seq = tl.load(batch_ptr + tl.program_id(0)) + chunk_offset = tl.load(token_chunk_offset_ptr + tl.program_id(0)) + + # BLOCK_N elements along the feature-dimension (channel) + idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + + if idx_seq == pad_slot_id: + return + + sequence_start_index = tl.load(query_start_loc_ptr + idx_seq) + sequence_end_index = tl.load(query_start_loc_ptr + idx_seq + 1) + # find the actual sequence length + seqlen = sequence_end_index - sequence_start_index + + token_offset = BLOCK_M * chunk_offset + segment_len = min(BLOCK_M, seqlen - token_offset) + + # base of the sequence + x_base = x_ptr + sequence_start_index * stride_x_token + idx_feats * stride_x_dim # 
[BLOCK_N,] + + if IS_CONTINUOUS_BATCHING: + # cache_idx + conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to( + tl.int64) + else: + # cache_idx + conv_state_batch_coord = idx_seq + if USE_PAD_SLOT: # noqa + if conv_state_batch_coord == pad_slot_id: + # not processing as this is not the actual sequence + return + conv_states_base = (conv_states_ptr + + (conv_state_batch_coord * stride_conv_state_seq) + + (idx_feats * stride_conv_state_dim)) # [BLOCK_N,] + + w_base = w_ptr + (idx_feats * stride_w_dim) # [BLOCK_N,] + + # Does 2 things: + # 1. READ prior-block init-state data - [done by every Triton programs] + # 2. update conv_state with new data [only by the Triton program handles chunk_offset=0] + if chunk_offset == 0: + # read from conv_states + load_init_state = False + if HAS_INITIAL_STATES: # the new HAS_INITIAL_STATES + load_init_state = tl.load(has_initial_states_ptr + idx_seq).to( + tl.int1) + if load_init_state: + # load from conv_states + prior_tokens = conv_states_base + (state_len - + 1) * stride_conv_state_tok + mask_w = idx_feats < dim + if KERNEL_WIDTH == 2: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH == 3: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0) + conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH == 4: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col2 = tl.load(conv_states_ptrs, mask_w, 0.0) + conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0) + conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH == 5: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col3 = tl.load(conv_states_ptrs, mask_w, 0.0) + conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok # [BLOCK_N] + 
col2 = tl.load(conv_states_ptrs, mask_w, 0.0) + conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0) + conv_states_ptrs = prior_tokens - 3 * stride_conv_state_tok # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0) + else: + # prior-tokens are zeros + if KERNEL_WIDTH >= 2: # STRATEGY1 + # first chunk and does not have prior-token, so just set to 0 + col0 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty) + if KERNEL_WIDTH >= 3: # STRATEGY1 + col1 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty) + if KERNEL_WIDTH >= 4: # STRATEGY1 + col2 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty) + if KERNEL_WIDTH >= 5: # STRATEGY1 + col3 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty) + + # STEP 2: + # here prepare data for updating conv_state + if state_len <= seqlen: # SMALL_CACHE=True (only move part of 'x' into conv_state cache) + # just read from 'x' + # copy 'x' data to conv_state + # load only 'x' data (and set 0 before 'x' if seqlen < state_len) + idx_tokens_last = (seqlen - state_len) + tl.arange( + 0, NP2_STATELEN) # [BLOCK_M] + x_ptrs = x_ptr + ( + (sequence_start_index + idx_tokens_last) * + stride_x_token)[:, None] + ( + idx_feats * stride_x_dim)[None, :] # [BLOCK_M,BLOCK_N,] + mask_x = ((idx_tokens_last >= 0)[:, None] & + (idx_tokens_last < seqlen)[:, None] & + (idx_feats < dim)[None, :] + ) # token-index # token-index # feature-index + loaded_x = tl.load(x_ptrs, mask_x, 0.0) + new_conv_state = tl.load(x_ptrs, mask_x, 0.0) + idx_tokens_conv = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + conv_states_ptrs_target = conv_states_base[None, :] + ( + idx_tokens_conv * stride_conv_state_tok)[:, None] + + mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats + < dim)[None, :] + # tl.debug_barrier() # NOTE: use this due to bug in Triton compiler + tl.store(conv_states_ptrs_target, new_conv_state, mask) + + else: + if load_init_state: + # update conv_state by shifting left, i.e. 
take last few cols from conv_state + cols from 'x' + idx_tokens_conv = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + + conv_states_ptrs_source = ( + conv_states_ptr + + (conv_state_batch_coord * stride_conv_state_seq) + + (idx_feats * stride_conv_state_dim)[None, :] + + ((idx_tokens_conv + seqlen) * stride_conv_state_tok)[:, + None] + ) # [BLOCK_M, BLOCK_N] + mask = ((conv_state_batch_coord < num_cache_lines) + & ((idx_tokens_conv + seqlen) < state_len)[:, None] + & (idx_feats < dim)[None, :]) + conv_state = tl.load(conv_states_ptrs_source, mask, other=0.0) + + VAL = state_len - seqlen + + x_ptrs = x_base[None, :] + ( + (idx_tokens_conv - VAL) * + stride_x_token)[:, None] # [BLOCK_M, BLOCK_N] + + mask_x = ((idx_tokens_conv - VAL >= 0)[:, None] & + (idx_tokens_conv - VAL < seqlen)[:, None] & + (idx_feats < dim)[None, :] + ) # token-index # token-index # feature-index + loaded_x = tl.load(x_ptrs, mask_x, 0.0) + + # tl.debug_barrier( + # ) # need this due to the bug in tl.where not enforcing this when data is the result of another tl.load + new_conv_state = tl.where( + mask, conv_state, loaded_x + ) # BUG in 'tl.where' which requires a barrier before this + conv_states_ptrs_target = conv_states_base + ( + idx_tokens_conv * + stride_conv_state_tok)[:, None] # [BLOCK_M, BLOCK_N] + mask = (idx_tokens_conv + < state_len)[:, None] & (idx_feats < dim)[None, :] + tl.store(conv_states_ptrs_target, new_conv_state, mask) + else: # load_init_state == False + # update conv_state by shifting left, BUT + # set cols prior to 'x' as zeros + cols from 'x' + idx_tokens_conv = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + + VAL = state_len - seqlen + + x_ptrs = x_base[None, :] + ( + (idx_tokens_conv - VAL) * + stride_x_token)[:, None] # [BLOCK_M, BLOCK_N] + + mask_x = ((idx_tokens_conv - VAL >= 0)[:, None] & + (idx_tokens_conv - VAL < seqlen)[:, None] & + (idx_feats < dim)[None, :] + ) # token-index # token-index # feature-index + new_conv_state = tl.load(x_ptrs, mask_x, 0.0) + + 
conv_states_ptrs_target = conv_states_base + ( + idx_tokens_conv * + stride_conv_state_tok)[:, None] # [BLOCK_M, BLOCK_N] + mask = (idx_tokens_conv + < state_len)[:, None] & (idx_feats < dim)[None, :] + tl.store(conv_states_ptrs_target, new_conv_state, mask) + + else: # chunk_offset > 0 + # read prior-token data from `x` + load_init_state = True + prior_tokens = x_base + (token_offset - 1) * stride_x_token + mask_w = idx_feats < dim + if KERNEL_WIDTH == 2: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + if KERNEL_WIDTH == 3: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + conv_states_ptrs = prior_tokens - 1 * stride_x_token # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + if KERNEL_WIDTH == 4: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + conv_states_ptrs = prior_tokens - 1 * stride_x_token # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + conv_states_ptrs = prior_tokens - 2 * stride_x_token # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + if KERNEL_WIDTH == 5: + # ruff: noqa: F841 + conv_states_ptrs = prior_tokens # [BLOCK_N] + col3 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + conv_states_ptrs = prior_tokens - 1 * stride_x_token # [BLOCK_N] + col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + conv_states_ptrs = prior_tokens - 2 * stride_x_token # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + conv_states_ptrs = prior_tokens - 3 * stride_x_token # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca') + + if HAS_BIAS: + bias = bias_ptr + idx_feats + mask_bias = idx_feats < dim + acc_preload = tl.load(bias, mask=mask_bias, + other=0.0).to(tl.float32) # 
[BLOCK_N] + else: + acc_preload = tl.zeros((BLOCK_N, ), dtype=tl.float32) + + x_base_1d = x_base + token_offset * stride_x_token # starting of chunk + + # PRE-LOAD WEIGHTS + mask_w = idx_feats < dim + if KERNEL_WIDTH >= 2: + w_ptrs = w_base + (0 * stride_w_width) # [BLOCK_N] tensor + w_col0 = tl.load(w_ptrs, mask_w, other=0.0) + w_ptrs = w_base + (1 * stride_w_width) # [BLOCK_N] tensor + w_col1 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 3: + w_ptrs = w_base + (2 * stride_w_width) # [BLOCK_N] tensor + w_col2 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 4: + w_ptrs = w_base + (3 * stride_w_width) # [BLOCK_N] tensor + w_col3 = tl.load(w_ptrs, mask_w, other=0.0) + mask_x_1d = idx_feats < dim + for idx_token in range(segment_len): + acc = acc_preload + + matrix_w = w_col0 + matrix_x = col0 + for j in tl.static_range(KERNEL_WIDTH): + + if KERNEL_WIDTH == 2: + if j == 1: # KERNEL_WIDTH-1: + matrix_w = w_col1 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 3: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 4: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + matrix_x = col2 + elif j == 3: + matrix_w = w_col3 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + + acc += matrix_x * matrix_w # [BLOCK_N] + + if KERNEL_WIDTH == 2: + col0 = matrix_x + elif KERNEL_WIDTH == 3: + col0 = col1 + col1 = matrix_x + elif KERNEL_WIDTH == 4: + col0 = col1 + col1 = col2 + col2 = matrix_x + + if SILU_ACTIVATION: + acc = acc / (1 + tl.exp(-acc)) + mask_1d = (idx_token < segment_len) & ( + idx_feats < dim) # token-index # feature-index + o_ptrs = o_ptr + (sequence_start_index + token_offset + idx_token + ) * 
def causal_conv1d_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Union[torch.Tensor, None],
    conv_states: torch.Tensor,
    query_start_loc: torch.Tensor,
    cache_indices: Optional[torch.Tensor] = None,
    has_initial_state: Optional[torch.Tensor] = None,
    activation: Optional[str] = "silu",
    pad_slot_id: int = PAD_SLOT_ID,
    metadata=None,
    validate_data=False,
):
    """Run the varlen / continuous-batching causal conv1d forward kernel.

    x: (dim, cu_seq_len)
        cu_seq_len = total tokens of all sequences in the batch;
        sequences are concatenated from left to right (varlen).
        `x` is cast to `conv_states.dtype` before the kernel runs.
    weight: (dim, width)
    bias: (dim,) or None
    conv_states: (..., dim, width - 1)
        Updated in place by the kernel. `cache_indices[i]` selects the cache
        line of sequence i; that line is used as the initial state when
        `has_initial_state[i]` is True and is afterwards shifted left and
        refreshed with the trailing values of `x`.
    query_start_loc: (batch + 1) int32
        Cumulative sequence lengths, prepended by 0, e.g. sequence lengths
        [5, 1, 1, 1] give query_start_loc = [0, 5, 6, 7, 8]
        (len(query_start_loc) - 1 == batch).
    cache_indices: (batch) int32
        Cache line used by each sequence:
        conv_state = conv_states[cache_indices[batch_id]].
    has_initial_state: (batch) bool
        Per-sequence flag: whether the current cache line is used as the
        initial state of the convolution.
    activation: None, "silu", "swish" or True (True is treated as "silu").
    pad_slot_id: int
        Entries of `cache_indices` equal to this value are padding and are
        skipped by the kernel, e.g. cache_indices =
        [pad_slot_id, 1, 20, pad_slot_id] skips entries 0 and 3.
    metadata: optional object exposing `nums_dict`, `batch_ptr` and
        `token_chunk_offset_ptr` (pre-computed launch bookkeeping). When
        None, the bookkeeping is derived here from `query_start_loc`.
    validate_data: when True, run the shape/stride assertions below.

    out: same shape as `x`, returned in the dtype `x` had on entry.
    """
    if isinstance(activation, bool) and activation:
        activation = "silu"

    args = None
    # Remember the caller's dtype: the kernel works in the cache dtype and the
    # result is cast back right before returning.
    original_x_dtype = x.dtype
    x = x.to(conv_states.dtype)
    out = torch.empty_like(x)
    if metadata is not None:
        nums_dict = metadata.nums_dict
        args = nums_dict
        batch_ptr = metadata.batch_ptr
        token_chunk_offset_ptr = metadata.token_chunk_offset_ptr
    else:
        # Per-sequence lengths on the host; used to count Triton programs.
        seqlens = query_start_loc.diff().to('cpu')
        args = seqlens
        MAX_NUM_PROGRAMS = 1024

        batch_ptr = torch.full(
            (MAX_NUM_PROGRAMS, ),
            PAD_SLOT_ID,
            dtype=torch.int32,
            device=x.device
        )  # tracking which seq-idx the Triton program is handling
        token_chunk_offset_ptr = torch.full(
            (MAX_NUM_PROGRAMS, ),
            PAD_SLOT_ID,
            dtype=torch.int32,
            device=x.device
        )  # tracking BLOCK_M-based index in the sequence the Triton program is handling

    is_channel_last = (x.stride(0) == 1) & (x.stride(1) > 1)
    dim, cu_seqlen = x.shape
    _, width = weight.shape
    state_len = width - 1
    np2_statelen = triton.next_power_of_2(state_len)

    padded_batch = query_start_loc.size(0) - 1
    stride_x_seq = 0
    stride_x_dim = x.stride(0)
    stride_x_token = x.stride(1)
    stride_w_dim = weight.stride(0)
    stride_w_width = weight.stride(1)
    stride_istate_seq = 0
    stride_istate_dim = 0
    stride_istate_token = 0
    num_cache_lines = 0
    if conv_states is not None:
        # extensions to support vLLM:
        # 1. conv_states is used to replaced initial_states
        # 2. conv_states serve as a cache with num cache lines can be larger than batch size
        # 3. mapping from sequence x[idx] to a cache line at index as specified via cache_indices[idx]
        # 4. computation can be skipped if cache_indices[idx] == pad_slot_id
        num_cache_lines = conv_states.size(0)
        assert (num_cache_lines == conv_states.shape[0]
                and dim == conv_states.shape[1]
                and width - 1 <= conv_states.shape[2])
        stride_istate_seq = conv_states.stride(0)
        stride_istate_dim = conv_states.stride(1)
        stride_istate_token = conv_states.stride(2)
        assert stride_istate_dim == 1
    if out.dim() == 2:
        stride_o_seq = 0
        stride_o_dim = out.stride(0)
        stride_o_token = out.stride(1)
    else:
        stride_o_seq = out.stride(0)
        stride_o_dim = out.stride(1)
        stride_o_token = out.stride(2)

    if validate_data:
        assert x.dim() == 2
        assert query_start_loc is not None
        assert query_start_loc.dim() == 1
        assert x.stride(0) == 1 or x.stride(1) == 1
        if bias is not None:
            assert bias.dim() == 1
            assert dim == bias.size(0)
        if cache_indices is not None:
            assert cache_indices.dim() == 1
            assert padded_batch == cache_indices.size(0)
        if has_initial_state is not None:
            assert has_initial_state.size() == (padded_batch, )
            assert conv_states is not None, "ERROR: `has_initial_state` is used, which needs also `conv_states`"
        assert weight.stride(1) == 1
        assert (dim, width) == weight.shape
        assert is_channel_last, "Need to run in channel-last layout"

    if metadata is None:

        def num_program(META, seqlens):
            # Number of BLOCK_M-sized chunks each sequence is split into
            # (ceil division).
            nums = -(-seqlens // META["BLOCK_M"])

            tot = nums.sum().item()
            # mlist[p]      -> sequence index handled by program p
            # offsetlist[p] -> chunk index of program p within that sequence
            mlist = np.repeat(np.arange(len(nums)), nums)
            offsetlist = []  # type: ignore
            for num in nums:
                offsetlist.extend(
                    range(num)
                )  # chunk-idx if a sequence is split into multiple chunks

            # Grow the bookkeeping buffers when more programs are needed than
            # they were allocated for.
            if META["batch_ptr"].nelement() < len(mlist):
                newlen = len(mlist) + 1
                META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
                META["token_chunk_offset_ptr"].resize_(newlen).fill_(
                    PAD_SLOT_ID)

            if META["batch_ptr"].nelement() >= len(mlist):
                META["batch_ptr"][0:len(mlist)].copy_(
                    torch.from_numpy(np.array(mlist)))
                META["token_chunk_offset_ptr"][0:len(mlist)].copy_(
                    torch.from_numpy(np.array(offsetlist)))

            META["batch_ptr"] = META["batch_ptr"].to(META["x_ptr"].device)
            META["token_chunk_offset_ptr"] = META["token_chunk_offset_ptr"].to(
                META["x_ptr"].device)
            return tot
    else:

        def num_program(META, nums_dict):
            # Bookkeeping was pre-computed per BLOCK_M and is looked up here.
            tot = nums_dict[META["BLOCK_M"]]['tot']

            mlist = nums_dict[META["BLOCK_M"]]['mlist']
            mlist_len = nums_dict[META["BLOCK_M"]]['mlist_len']

            offsetlist = nums_dict[META["BLOCK_M"]]['offsetlist']

            if nums_dict[META["BLOCK_M"]]["batch_ptr"] is not None:
                META["batch_ptr"] = nums_dict[META["BLOCK_M"]]["batch_ptr"]
                META["token_chunk_offset_ptr"] = nums_dict[
                    META["BLOCK_M"]]["token_chunk_offset_ptr"]
            else:
                if META["batch_ptr"].nelement() < mlist_len:
                    newlen = mlist_len + 1
                    META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
                    META["token_chunk_offset_ptr"].resize_(newlen).fill_(
                        PAD_SLOT_ID)

                if META["batch_ptr"].nelement() >= mlist_len:
                    META["batch_ptr"][0:mlist_len].copy_(mlist)
                    META["token_chunk_offset_ptr"][0:mlist_len].copy_(
                        offsetlist)
            return tot

    def grid(META):
        # axis 0: one program per (sequence, chunk); axis 1: feature blocks
        return (
            num_program(META, args),
            triton.cdiv(dim, META["BLOCK_N"]),
        )

    if batch_ptr.device != x.device:
        batch_ptr = batch_ptr.to(x.device)
        token_chunk_offset_ptr = token_chunk_offset_ptr.to(x.device)

    # NOTE(review): groups_per_cluster / isCloseUnrollControl /
    # isCloseVectorization / is_use_mask_zero are not standard Triton launch
    # options; presumably they are consumed by the Kunlun XPU Triton backend -
    # confirm against that backend's documentation.
    _causal_conv1d_fwd_kernel[grid](
        # Pointers to matrices
        x,
        weight,
        bias,
        conv_states,
        cache_indices,
        has_initial_state,
        query_start_loc,
        batch_ptr,
        token_chunk_offset_ptr,
        out,
        # Matrix dimensions
        padded_batch,
        dim,
        cu_seqlen,
        num_cache_lines,
        # stride
        stride_x_seq,
        stride_x_dim,
        stride_x_token,
        stride_w_dim,
        stride_w_width,
        stride_istate_seq,
        stride_istate_dim,
        stride_istate_token,
        stride_o_seq,
        stride_o_dim,
        stride_o_token,
        # others
        pad_slot_id,
        # META
        HAS_BIAS=bias is not None,
        KERNEL_WIDTH=width,
        SILU_ACTIVATION=activation in ["silu", "swish"],
        HAS_INITIAL_STATES=has_initial_state is not None,
        HAS_CACHE=conv_states is not None,
        IS_CONTINUOUS_BATCHING=cache_indices is not None,
        USE_PAD_SLOT=pad_slot_id is not None,
        NP2_STATELEN=np2_statelen,
        #launch_cooperative_grid=True
        BLOCK_M=8,
        BLOCK_N=256,
        num_stages=2,
        groups_per_cluster=np2_statelen,
        isCloseUnrollControl=True,
        isCloseVectorization=True,
        is_use_mask_zero=True,
    )
    # Hand the result back in the caller's dtype (no copy when the dtypes
    # already match).
    return out.to(original_x_dtype)
stride_o_dim: tl.constexpr, + stride_o_token: tl.constexpr, + # others + pad_slot_id: tl.constexpr, + # Meta-parameters + HAS_BIAS: tl.constexpr, + KERNEL_WIDTH: tl.constexpr, + SILU_ACTIVATION: tl.constexpr, + IS_CONTINUOUS_BATCHING: tl.constexpr, + IS_SPEC_DECODING: tl.constexpr, + NP2_STATELEN: tl.constexpr, + USE_PAD_SLOT: tl.constexpr, + BLOCK_N: tl.constexpr, +): + # ruff: noqa: E501 + idx_seq = batch_id + # idx_seq = tl.program_id(0) + if idx_seq >= batch: + return + + # [BLOCK_N,] elements along the feature-dimension (channel) + idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + + if IS_CONTINUOUS_BATCHING: + # mask = idx_seq < batch + conv_state_batch_coord = tl.load(conv_state_indices_ptr + + idx_seq * stride_state_indices).to( + tl.int64) + else: + conv_state_batch_coord = idx_seq + if USE_PAD_SLOT: # noqa + if conv_state_batch_coord == pad_slot_id: + # not processing as this is not the actual sequence + return + + if IS_SPEC_DECODING: + # The rolling of conv state: + # + # Before forward, the conv_state is: + # [history1, history2, ..., historyM]. + # + # After forward, the conv_state becomes: + # [history2, ..., historyM, draft1, draft2, ..., draftN]. + # + # After acceptance, it becomes: + # + # - accept 1 tokens: [history2, ..., historyM, draft1] + # - accept 2 tokens: [history3, ..., historyM, draft1, draft2] + # - and so on. 
+ conv_state_token_offset = (tl.load(num_accepted_tokens_ptr + idx_seq) - + 1) + else: + conv_state_token_offset = 0 + + # STEP 1: READ init_state data + conv_states_base = (conv_state_ptr + + (conv_state_batch_coord * stride_conv_state_seq) + + (idx_feats * stride_conv_state_dim)) + mask_w = idx_feats < dim + + prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok + if KERNEL_WIDTH >= 2: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 3: + conv_states_ptrs = prior_tokens + 1 * stride_conv_state_tok # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 4: + conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok # [BLOCK_N] + col2 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH == 5: + conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok # [BLOCK_N] + col3 = tl.load(conv_states_ptrs, mask_w, 0.0) + + # STEP 2: assume state_len > seqlen + idx_tokens = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + + # With speculative decoding, the conv_state updates works in a sliding + # window manner, at each forward pass, the tokens are shift by 1, so we + # load since idx_tokens + 1. 
+ conv_state_ptrs_source = ( + conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) + + conv_state_token_offset * stride_conv_state_tok + + (idx_feats * stride_conv_state_dim)[None, :] + + ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) * + stride_conv_state_tok)[:, None]) # [BLOCK_M, BLOCK_N] + mask = ((conv_state_batch_coord < num_cache_lines) + & ((idx_tokens + seqlen) < state_len)[:, None] + & (idx_feats < dim)[None, :]) + conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0) + + VAL = state_len - seqlen + x_base = x_ptr + (idx_seq * stride_x_seq) + (idx_feats * stride_x_dim + ) # [BLOCK_N] + + x_ptrs = x_base[None, :] + ( + (idx_tokens - VAL) * stride_x_token)[:, None] # [BLOCK_M, BLOCK_N] + + mask_x = ((idx_tokens - VAL >= 0)[:, None] & + (idx_tokens - VAL < seqlen)[:, None] & (idx_feats < dim)[None, :] + ) # token-index # token-index # feature-index + loaded_x = tl.load(x_ptrs, mask_x, 0.0) + + new_conv_state = tl.where(mask, conv_state, loaded_x) + + conv_state_base = (conv_state_ptr + + (conv_state_batch_coord * stride_conv_state_seq) + + (idx_feats * stride_conv_state_dim)) # [BLOCK_N,] + conv_state_ptrs_target = conv_state_base + ( + idx_tokens * stride_conv_state_tok)[:, None] # [BLOCK_M, BLOCK_N] + mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :] + tl.store(conv_state_ptrs_target, new_conv_state, mask) + + # STEP 3: init accumulator + if HAS_BIAS: + bias = bias_ptr + idx_feats + mask_bias = idx_feats < dim + acc_preload = tl.load(bias, mask=mask_bias, + other=0.0).to(tl.float32) # [BLOCK_N] + else: + acc_preload = tl.zeros((BLOCK_N, ), dtype=tl.float32) + + # STEP 4: + # PRE-LOAD WEIGHTS + # first kernel column, configured for weights to handle BLOCK_N features in range + w_base = w_ptr + (idx_feats * stride_w_dim) # [BLOCK_N,] + mask_w = idx_feats < dim + if KERNEL_WIDTH >= 2: + w_ptrs = w_base + (0 * stride_w_width) # [BLOCK_N] tensor + w_col0 = tl.load(w_ptrs, mask_w, other=0.0) + w_ptrs = w_base + (1 * 
# Decode-time causal conv1d update: for one sequence and one block of BLOCK_N
# channels, read the previous conv state, roll it forward with the new tokens
# from `x`, write it back, and emit the convolution output for each of the
# `seqlen` tokens.
#
# The sequence handled by a program is taken from grid axis 0 and the channel
# block from grid axis 1 - assumes the launcher uses a
# (batch, cdiv(dim, BLOCK_N)) grid; the launch site is outside this block,
# confirm there.
#
# `cache_seqlens_ptr` is accepted but never read in this kernel.
# NOTE(review): KERNEL_WIDTH == 5 loads `col3`, but STEP 5 only has compute
# branches for widths 2, 3 and 4.
@triton.jit()
def _causal_conv1d_update_kernel(
    # Pointers to matrices
    x_ptr,  # (batch, dim, seqlen)
    w_ptr,  # (dim, width)
    bias_ptr,
    conv_state_ptr,
    cache_seqlens_ptr,  # circular buffer
    conv_state_indices_ptr,
    num_accepted_tokens_ptr,
    o_ptr,  # (batch, dim, seqlen)
    # Matrix dimensions
    batch: int,
    dim: tl.constexpr,
    seqlen: tl.constexpr,
    state_len: tl.constexpr,
    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
    # Strides
    stride_x_seq: tl.constexpr,
    stride_x_dim: tl.constexpr,
    stride_x_token: tl.constexpr,
    stride_w_dim: tl.constexpr,
    stride_w_width: tl.constexpr,
    stride_conv_state_seq: tl.constexpr,
    stride_conv_state_dim: tl.constexpr,
    stride_conv_state_tok: tl.constexpr,
    stride_state_indices: tl.constexpr,
    stride_o_seq: tl.constexpr,
    stride_o_dim: tl.constexpr,
    stride_o_token: tl.constexpr,
    # others
    pad_slot_id: tl.constexpr,
    # Meta-parameters
    HAS_BIAS: tl.constexpr,
    KERNEL_WIDTH: tl.constexpr,
    SILU_ACTIVATION: tl.constexpr,
    IS_CONTINUOUS_BATCHING: tl.constexpr,
    IS_SPEC_DECODING: tl.constexpr,
    NP2_STATELEN: tl.constexpr,
    USE_PAD_SLOT: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # ruff: noqa: E501
    # This kernel has no `batch_id` parameter (only the `_xpu` variant does),
    # so the sequence index has to come from the launch grid.
    idx_seq = tl.program_id(0)
    if idx_seq >= batch:
        return

    # [BLOCK_N,] elements along the feature-dimension (channel)
    idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

    if IS_CONTINUOUS_BATCHING:
        # mask = idx_seq < batch
        conv_state_batch_coord = tl.load(conv_state_indices_ptr +
                                         idx_seq * stride_state_indices).to(
                                             tl.int64)
    else:
        conv_state_batch_coord = idx_seq
    if USE_PAD_SLOT:  # noqa
        if conv_state_batch_coord == pad_slot_id:
            # not processing as this is not the actual sequence
            return

    if IS_SPEC_DECODING:
        # The rolling of conv state:
        #
        # Before forward, the conv_state is:
        # [history1, history2, ..., historyM].
        #
        # After forward, the conv_state becomes:
        # [history2, ..., historyM, draft1, draft2, ..., draftN].
        #
        # After acceptance, it becomes:
        #
        # - accept 1 tokens: [history2, ..., historyM, draft1]
        # - accept 2 tokens: [history3, ..., historyM, draft1, draft2]
        # - and so on.
        conv_state_token_offset = (tl.load(num_accepted_tokens_ptr + idx_seq) -
                                   1)
    else:
        conv_state_token_offset = 0

    # STEP 1: READ init_state data
    conv_states_base = (conv_state_ptr +
                        (conv_state_batch_coord * stride_conv_state_seq) +
                        (idx_feats * stride_conv_state_dim))
    mask_w = idx_feats < dim

    prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok
    if KERNEL_WIDTH >= 2:
        conv_states_ptrs = prior_tokens  # [BLOCK_N]
        col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
    if KERNEL_WIDTH >= 3:
        conv_states_ptrs = prior_tokens + 1 * stride_conv_state_tok  # [BLOCK_N]
        col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
    if KERNEL_WIDTH >= 4:
        conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok  # [BLOCK_N]
        col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
    if KERNEL_WIDTH == 5:
        conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok  # [BLOCK_N]
        col3 = tl.load(conv_states_ptrs, mask_w, 0.0)

    # STEP 2: assume state_len > seqlen
    idx_tokens = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]

    # With speculative decoding, the conv_state updates works in a sliding
    # window manner, at each forward pass, the tokens are shift by 1, so we
    # load since idx_tokens + 1.
    conv_state_ptrs_source = (
        conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) +
        conv_state_token_offset * stride_conv_state_tok +
        (idx_feats * stride_conv_state_dim)[None, :] +
        ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) *
         stride_conv_state_tok)[:, None])  # [BLOCK_M, BLOCK_N]
    mask = ((conv_state_batch_coord < num_cache_lines)
            & ((idx_tokens + seqlen) < state_len)[:, None]
            & (idx_feats < dim)[None, :])
    conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0)

    # Number of old state columns that survive the shift; the remaining
    # `seqlen` columns are filled from `x`.
    VAL = state_len - seqlen
    x_base = x_ptr + (idx_seq * stride_x_seq) + (idx_feats * stride_x_dim
                                                 )  # [BLOCK_N]

    x_ptrs = x_base[None, :] + (
        (idx_tokens - VAL) * stride_x_token)[:, None]  # [BLOCK_M, BLOCK_N]

    mask_x = ((idx_tokens - VAL >= 0)[:, None] &
              (idx_tokens - VAL < seqlen)[:, None] & (idx_feats < dim)[None, :]
              )  # token-index  # token-index  # feature-index
    loaded_x = tl.load(x_ptrs, mask_x, 0.0)
    # tl.debug_barrier()

    # Kept history where `mask` is set, new tokens from `x` elsewhere.
    new_conv_state = tl.where(mask, conv_state, loaded_x)

    conv_state_base = (conv_state_ptr +
                       (conv_state_batch_coord * stride_conv_state_seq) +
                       (idx_feats * stride_conv_state_dim))  # [BLOCK_N,]
    conv_state_ptrs_target = conv_state_base + (
        idx_tokens * stride_conv_state_tok)[:, None]  # [BLOCK_M, BLOCK_N]
    mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :]
    tl.store(conv_state_ptrs_target, new_conv_state, mask)

    # STEP 3: init accumulator
    if HAS_BIAS:
        bias = bias_ptr + idx_feats
        mask_bias = idx_feats < dim
        acc_preload = tl.load(bias, mask=mask_bias,
                              other=0.0).to(tl.float32)  # [BLOCK_N]
    else:
        acc_preload = tl.zeros((BLOCK_N, ), dtype=tl.float32)

    # STEP 4:
    # PRE-LOAD WEIGHTS
    # first kernel column, configured for weights to handle BLOCK_N features in range
    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
    mask_w = idx_feats < dim
    if KERNEL_WIDTH >= 2:
        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
    if KERNEL_WIDTH >= 3:
        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
    if KERNEL_WIDTH >= 4:
        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)

    x_base_1d = x_base  # starting of chunk [BLOCK_N]
    mask_x_1d = idx_feats < dim

    # STEP 5: compute each token
    for idx_token in tl.static_range(seqlen):
        acc = acc_preload

        # col0..col{W-2} hold the previous tokens; the last tap (j == W-1)
        # reads the current token from `x`.
        matrix_w = w_col0
        matrix_x = col0
        for j in tl.static_range(KERNEL_WIDTH):
            if KERNEL_WIDTH == 2:
                if j == 1:  # KERNEL_WIDTH-1:
                    matrix_w = w_col1
                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
            elif KERNEL_WIDTH == 3:
                if j == 1:
                    matrix_w = w_col1
                    matrix_x = col1
                elif j == 2:
                    matrix_w = w_col2
                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
            elif KERNEL_WIDTH == 4:
                if j == 1:
                    matrix_w = w_col1
                    matrix_x = col1
                elif j == 2:
                    matrix_w = w_col2
                    matrix_x = col2
                elif j == 3:
                    matrix_w = w_col3
                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)

            acc += matrix_x * matrix_w  # [BLOCK_N]

        # Slide the window: drop the oldest column, append the current token.
        if KERNEL_WIDTH == 2:
            col0 = matrix_x
        elif KERNEL_WIDTH == 3:
            col0 = col1
            col1 = matrix_x
        elif KERNEL_WIDTH == 4:
            col0 = col1
            col1 = col2
            col2 = matrix_x

        if SILU_ACTIVATION:
            acc = acc / (1 + tl.exp(-acc))
        mask_1d = (idx_token < seqlen) & (idx_feats < dim
                                          )  # token-index  # feature-index
        o_ptrs = o_ptr + (
            idx_seq) * stride_o_seq + idx_token * stride_o_token + (
                idx_feats * stride_o_dim)

        tl.store(o_ptrs, acc, mask=mask_1d)
None] = None, + cache_seqlens: Optional[torch.Tensor] = None, + conv_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, + pad_slot_id: int = PAD_SLOT_ID, + metadata=None, + validate_data=False, +): + """ + x: (batch, dim) or (batch, dim, seqlen) + [shape=2: single token prediction] + [shape=3: single or multiple tokens prediction] + conv_state: (..., dim, state_len), where state_len >= width - 1 + weight: (dim, width) + bias: (dim,) + cache_seqlens: (batch,), dtype int32. + If not None, the conv_state is treated as a circular buffer. + The conv_state will be updated by copying x to the conv_state + starting at the index + @cache_seqlens % state_len. + conv_state_indices: (batch,), dtype int32 + If not None, the conv_state is a larger tensor along the batch dim, + and we are selecting the batch coords specified by conv_state_indices. + Useful for a continuous batching scenario. + pad_slot_id: int + if cache_indices is passed, lets the kernel identify padded + entries that will not be processed, + for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id] + in this case, the kernel will not process entries at + indices 0 and 3 + out: (batch, dim) or (batch, dim, seqlen) + """ + if validate_data: + assert cache_seqlens is None # not implemented yet - ok for vLLM + assert pad_slot_id is not None + assert x.stride(1) == 1 + if isinstance(activation, bool): + activation = "silu" if activation is True else None + elif activation is not None: + assert activation in ["silu", "swish"] + unsqueeze = x.dim() == 2 + if unsqueeze: + # make it (batch, dim, seqlen) with seqlen == 1 + x = x.unsqueeze(-1) + batch, dim, seqlen = x.shape + _, width = weight.shape + # conv_state: (..., dim, state_len), where state_len >= width - 1 + num_cache_lines, _, state_len = conv_state.size() + + if validate_data: + assert dim == weight.size(0) + assert conv_state.stride( + -2 + ) == 1, f"ERROR: expect contiguous along feat-dim of conv_state 
(currently stride={conv_state.stride()})" + assert state_len >= width - 1 + # when above happens, we don't shift-left to keep any records in conv_state + assert dim == conv_state.size(1) + if conv_state_indices is None: + assert conv_state.size(0) >= batch + else: + assert (batch, ) == conv_state_indices.shape + + assert num_cache_lines >= batch + assert weight.stride(1) == 1 # Need this + assert cache_seqlens is None # not needed for vLLM - circular buffer + + # adopt the strategy in vLLM that overwrite on 'x' directly, rather than creating a new tensor 'o' + out = x + stride_w_dim, stride_w_width = weight.stride() + + stride_x_seq, stride_x_dim, stride_x_token = x.stride( + ) # X (batch, dim, seqlen) + + stride_o_seq, stride_o_dim, stride_o_token = out.stride() + stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride( + ) + stride_state_indices = conv_state_indices.stride( + 0) if conv_state_indices is not None else 0 + if num_accepted_tokens is not None: + state_len = width - 1 + (seqlen - 1) # effective state_len needed + else: + state_len = width - 1 + np2_statelen = triton.next_power_of_2(state_len) + + def grid(META): + return ( + 1, + triton.cdiv(dim, META["BLOCK_N"]), + ) + for batch_id in range(batch): + _causal_conv1d_update_kernel_xpu[grid]( + x, + weight, + bias, + conv_state, + cache_seqlens, + conv_state_indices, + num_accepted_tokens, + out, + batch_id=batch_id, + batch=batch, + dim=dim, + seqlen=seqlen, + state_len=state_len, + num_cache_lines=num_cache_lines, + stride_x_seq=stride_x_seq, + stride_x_dim=stride_x_dim, + stride_x_token=stride_x_token, + stride_w_dim=stride_w_dim, + stride_w_width=stride_w_width, + stride_conv_state_seq=stride_istate_seq, + stride_conv_state_dim=stride_istate_dim, + stride_conv_state_tok=stride_istate_token, + stride_state_indices=stride_state_indices, + stride_o_seq=stride_o_seq, + stride_o_dim=stride_o_dim, + stride_o_token=stride_o_token, + pad_slot_id=pad_slot_id, + HAS_BIAS=bias is not 
None, + KERNEL_WIDTH=width, + SILU_ACTIVATION=activation in ["silu", "swish"], + IS_CONTINUOUS_BATCHING=conv_state_indices is not None, + IS_SPEC_DECODING=num_accepted_tokens is not None, + NP2_STATELEN=np2_statelen, + USE_PAD_SLOT=pad_slot_id is not None, + BLOCK_N=256, + groups_per_cluster=np2_statelen, + isCloseUnrollControl=True, + isCloseVectorization=True, + isCloseOffsetAnalysis=True, + is_use_mask_zero = True + ) + if unsqueeze: + out = out.squeeze(-1) + return out diff --git a/vllm_kunlun/ops/paged_attn.py b/vllm_kunlun/ops/paged_attn.py index 1775b4a..b2bcd14 100644 --- a/vllm_kunlun/ops/paged_attn.py +++ b/vllm_kunlun/ops/paged_attn.py @@ -24,7 +24,6 @@ _PARTITION_SIZE = 512 @dataclass class PagedAttentionMetadata: """Metadata for PagedAttention.""" - # (batch_size,). The length of sequences (entire tokens seen so far) per # sequence. seq_lens_tensor: Optional[torch.Tensor] @@ -53,18 +52,18 @@ class PagedAttention: head_size: int, ) -> Tuple[int, ...]: """ - Get the shape of the KV cache. Returns different shapes based on whether the computation is on-chip. - If on-chip (is_kunlun() is True), returns shape (2, num_blocks, num_kv_heads, block_size, head_size); - Otherwise, returns shape (2, num_blocks, block_size * num_kv_heads * head_size). - + 获取KV缓存的形状,根据是否在芯片上进行计算返回不同的形状。 + 如果在芯片上(is_kunlun()为True),则返回形状(2, num_blocks, num_kv_heads, block_size, head_size); + 否则,返回形状(2, num_blocks, block_size * num_kv_heads * head_size)。 + Args: - num_blocks (int): The number of blocks. - block_size (int): The size of each block. - num_kv_heads (int): The number of KV heads. - head_size (int): The size of each head. - + num_blocks (int): 块数量。 + block_size (int): 每个块大小。 + num_kv_heads (int): KV头数量。 + head_size (int): 每个头大小。 + Returns: - Tuple[int, ...]: The shape of the KV cache, including two elements: the first element is 2, indicating the number of dimensions is 2; the second element is one of num_blocks, num_kv_heads, block_size, and head_size. 
+ Tuple[int, ...]: KV缓存的形状,包括两个元素:第一个元素为2,表示维度数量为2;第二个元素为num_blocks、num_kv_heads、block_size和head_size中的任意一个。 """ if current_platform.is_kunlun(): return (2, num_blocks, num_kv_heads, block_size, head_size) @@ -77,20 +76,20 @@ class PagedAttention: head_size: int, ) -> Tuple[torch.Tensor, torch.Tensor]: """ - Split a cached tensor (containing key and value) into two parts, each part is a tensor. - If running on KUNLUN, the first returned tensor is the key cache, and the second tensor is the value cache. - Otherwise, the first tensor is the key cache, and the second tensor is a view of the key cache with shape (num_blocks, num_kv_heads, head_size//x, -1, x), - and the third tensor is the value cache with shape (num_blocks, num_kv_heads, head_size, -1). - + 将一个缓存张量(包含key和value)分成两部分,每个部分是一个张量。 + 如果在KUNLUN上运行,则返回的第一个张量是key缓存,第二个张量是value缓存。 + 否则,第一个张量是key缓存,第二个张量是key缓存的view,其形状为(num_blocks, num_kv_heads, head_size//x, -1, x), + 第三个张量是value缓存,其形状为(num_blocks, num_kv_heads, head_size, -1)。 + Args: - kv_cache (torch.Tensor): A tensor containing key and value, with shape (2, num_blocks, kv_cache_size). - num_kv_heads (int): The number of heads in multi-head attention. - head_size (int): The size of each head. - + kv_cache (torch.Tensor): 包含key和value的张量,形状为(2, num_blocks, kv_cache_size)。 + num_kv_heads (int): 多头注意力中的头数。 + head_size (int): 每个头的大小。 + Returns: Tuple[torch.Tensor, torch.Tensor]: - - key_cache (torch.Tensor): A tensor containing the key cache, with shape (num_blocks, num_kv_heads, head_size//x, -1, x). - - value_cache (torch.Tensor): A tensor containing the value cache, with shape (num_blocks, num_kv_heads, head_size, -1). 
+ - key_cache (torch.Tensor): 形状为(num_blocks, num_kv_heads, head_size//x, -1, x),包含key缓存。 + - value_cache (torch.Tensor): 形状为(num_blocks, num_kv_heads, head_size, -1),包含value缓存。 """ x = 16 // kv_cache.element_size() num_blocks = kv_cache.shape[1] @@ -100,7 +99,8 @@ class PagedAttention: value_cache = kv_cache[1] else: key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, -1, x) + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) value_cache = kv_cache[1] value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) return key_cache, value_cache @@ -152,17 +152,16 @@ class PagedAttention: if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: # use blocksparse paged attention block_size = value_cache.size(-1) - assert ( - blocksparse_block_size > 0 and blocksparse_block_size % block_size == 0 - ), ( - f"{blocksparse_block_size=} needs to be a multiple of" - f"{block_size=} used in block_tables." - ) + assert (blocksparse_block_size > 0 and + blocksparse_block_size % block_size == 0), \ + (f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables.") output = torch.empty_like(query) block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape - max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE + max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) # NOTE(woosuk): We use a simple heuristic to decide whether to use # PagedAttention V1 or V2. If the number of partitions is 1, we use # V1 to avoid the overhead of reduction. Also, if the number of @@ -170,10 +169,9 @@ class PagedAttention: # to parallelize. # TODO(woosuk): Tune this heuristic. # For context len > 8192, use V2 kernel to avoid shared memory shortage. 
- use_v1 = max_seq_len <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512 - ) - + use_v1 = (max_seq_len <= 8192 + and (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: # Run PagedAttention V1. ops.paged_attention_v1( @@ -302,4 +300,4 @@ class PagedAttention: ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] - ops.copy_blocks(key_caches, value_caches, src_to_dists) + ops.copy_blocks(key_caches, value_caches, src_to_dists) \ No newline at end of file diff --git a/vllm_kunlun/ops/quantization/awq.py b/vllm_kunlun/ops/quantization/awq.py deleted file mode 100644 index e7b65bc..0000000 --- a/vllm_kunlun/ops/quantization/awq.py +++ /dev/null @@ -1,128 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Author: Li Wei, Pan Xiakai, You Zeyu -# Email: liwei157@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from typing import Optional -from vllm.model_executor.layers.quantization.awq import AWQLinearMethod - - -def repack_int4_for_kunlun(self, packed: torch.Tensor, num_bits: int = 4): - """Convert AWQ-packed int4 weights to Kunlun XPU format. 
- Input: packed[N, K], dtype=int32, saved as AWQ order - Output: packed_reordered[N, K], dtype=int32, saved as Kunlun order - """ - N, K = packed.shape - self.align_type = 1 if K % 8 == 0 else 0 - assert num_bits == 4, "Only int4 supported now" - shifts = torch.arange(0, 32, num_bits, device=packed.device, dtype=torch.int32) - - if self.align_type == 0: # NORMAL MODE - # Unpack AWQ order:[0, 2, 4, 6, 1, 3, 5, 7] - unpacked_awq = (packed.unsqueeze(-1) >> shifts) & 0xF # [N, K, 8] - - # Reverse AWQ order and convert to KUNLUN order - AWQ_TO_KUNLUN_ORDER_NORMAL = [4, 0, 5, 1, 6, 2, 7, 3] - # [0,2,4,6,1,3,5,7] --> [1, 0, 3, 2, 5, 4, 7, 6] - unpacked_kunlun = unpacked_awq[..., AWQ_TO_KUNLUN_ORDER_NORMAL] # [N, K, 8] - - # Pack to int32, order[6, 7, 4, 5, 2, 3, 0, 1] - packed_kunlun = (unpacked_kunlun << shifts).sum( - dim=-1, dtype=torch.int32 - ) # [N, K] - elif self.align_type == 1: # FAST MODEL - # Unpack AWQ order - unpacked_awq = ( - packed.view(N, K // 8, 8).unsqueeze(-1) >> shifts - ) & 0xF # [N, K//8, 8, 8] - - # Reverse AWQ order and convert to KUNLUN order - AWQ_TO_KUNLUN_ORDER_FAST = [ - 32, 0, 36, 4, 33, 1, 37, 5, - 34, 2, 38, 6, 35, 3, 39, 7, - 40, 8, 44, 12, 41, 9, 45, 13, - 42, 10, 46, 14, 43, 11, 47, 15, - 48, 16, 52, 20, 49, 17, 53, 21, - 50, 18, 54, 22, 51, 19, 55, 23, - 56, 24, 60, 28, 57, 25, 61, 29, - 58, 26, 62, 30, 59, 27, 63, 31 - ] - unpacked_awq = unpacked_awq.reshape(N, K // 8, 64) - unpacked_kunlun = unpacked_awq[..., AWQ_TO_KUNLUN_ORDER_FAST] # [N, K//8, 64] - - # Pack to int32 - unpacked_kunlun = unpacked_kunlun.reshape(N, K // 8, 8, 8) - packed_kunlun = ( - (unpacked_kunlun << shifts).sum(dim=-1, dtype=torch.int32).reshape(N, K) - ) # [N, K] - else: - raise NotImplementedError - - return packed_kunlun - - -def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.qweight = torch.nn.Parameter( - ( - self.repack_int4_for_kunlun(layer.qweight.data) - if layer.qweight.data.dtype == torch.int32 - else layer.qweight.data - 
), - requires_grad=False, - ) - layer.qzeros = torch.nn.Parameter( - ( - self.repack_int4_for_kunlun(layer.qzeros.data) - if layer.qzeros.data.dtype == torch.int32 - else layer.qzeros.data - ), - requires_grad=False, - ) - layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False) - - -def apply( - self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None -) -> torch.Tensor: - qweight = layer.qweight - scales = layer.scales - qzeros = layer.qzeros - pack_factor = self.quant_config.pack_factor - out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,) - reshaped_x = x.reshape(-1, x.shape[-1]) - - # num_tokens >= threshold - FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256 - - if FP16_MATMUL_HEURISTIC_CONDITION: - out = torch.ops._C.awq_dequantize( - qweight, scales, qzeros, quant_type=0, align_type=self.align_type - ) - out = torch.matmul(reshaped_x, out) - else: - out = torch.ops._C.awq_gemm( - reshaped_x, qweight, scales, qzeros, align_type=self.align_type - ) - if bias is not None: - out.add_(bias) - return out.reshape(out_shape) - - -AWQLinearMethod.repack_int4_for_kunlun = repack_int4_for_kunlun -AWQLinearMethod.process_weights_after_loading = process_weights_after_loading -AWQLinearMethod.apply = apply diff --git a/vllm_kunlun/ops/quantization/compressed_tensors_moe.py b/vllm_kunlun/ops/quantization/compressed_tensors_moe.py index ff96c23..7b06bc5 100644 --- a/vllm_kunlun/ops/quantization/compressed_tensors_moe.py +++ b/vllm_kunlun/ops/quantization/compressed_tensors_moe.py @@ -1,37 +1,14 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# -# This file is a part of the vllm-kunlun project. -# Author: Chen Zhennan, Dong Xinyu -# Email: chenzhennan@baidu.com -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import torch from typing import Any, Literal, Optional, cast, Callable, Optional -from compressed_tensors.config import ( - CompressionFormat, - SparsityCompressionConfig, - SparsityStructure, -) -from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy -from vllm.model_executor.layers.fused_moe import ( - FusedMoE, - FusedMoEMethodBase, - FusedMoeWeightScaleSupported, -) +from compressed_tensors.config import (CompressionFormat, + SparsityCompressionConfig, + SparsityStructure) +from compressed_tensors.quantization import (ActivationOrdering, + QuantizationStrategy) +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, + FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.utils import replace_parameter - # TODO: import position will be changed after 0.9.0 # vllm.model_executor.layers.fused_moe.fused_moe --> vllm.model_executor.layers.fused_moe @@ -42,7 +19,6 @@ import xtorch_ops from safetensors.torch import load_file as safe_load_file - class CompressedTensorsMoEMethod(FusedMoEMethodBase): def get_moe_method(quant_config, layer) -> "CompressedTensorsMoEMethod": @@ -50,239 +26,177 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): linear_cfg = None for k in ("Linear", "FusedMoE", "MoE", "Moe", "Experts"): if k in tsm and isinstance(tsm[k], dict): - linear_cfg = tsm[k] - break + linear_cfg = tsm[k]; break if not linear_cfg: # print("target_scheme_map missing; fallback to INT8(W8A8) method") return CompressedTensorsW8A8Int8MoEMethod(quant_config) - wq = 
linear_cfg.get("weights") - aq = linear_cfg.get("input_activations") + wq = linear_cfg.get("weights"); aq = linear_cfg.get("input_activations") if not wq or not aq: # print("incomplete scheme; fallback to INT8(W8A8)") return CompressedTensorsW8A8Int8MoEMethod(quant_config) - - # Other branches are handled as needed; default fallback: + # 其它分流按需;默认回落: return CompressedTensorsW8A8Int8MoEMethod(quant_config) - # copied from vllm 0.9.0 class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 ): self.quant_config = quant_config - - # Directly create a default quantization config dictionary to avoid validation issues with QuantizationArgs + + # 直接创建默认的量化配置字典,避免 QuantizationArgs 的验证问题 # print("Creating default INT8 quantization config for MoE") + + # 创建默认的权重量化配置字典 + self.weight_quant = type('WeightQuant', (), { + 'type': 'int', + 'num_bits': 8, + 'strategy': 'channel', + 'group_size': 128, + 'symmetric': True, + 'dynamic': False, + 'actorder': 'none', + 'observer': None, + 'observer_kwargs': {}, + 'block_structure': None + })() + + # 创建默认的输入激活量化配置字典 + self.input_quant = type('InputQuant', (), { + 'type': 'int', + 'num_bits': 8, + 'strategy': 'token', + 'group_size': 128, + 'symmetric': True, + 'dynamic': True, + 'actorder': 'none', + 'observer': None, + 'observer_kwargs': {}, + 'block_structure': None + })() - # Create a default weight quantization config dictionary - self.weight_quant = type( - "WeightQuant", - (), - { - "type": "int", - "num_bits": 8, - "strategy": "channel", - "group_size": 128, - "symmetric": True, - "dynamic": False, - "actorder": "none", - "observer": None, - "observer_kwargs": {}, - "block_structure": None, - }, - )() - - # Create a default input activation quantization config dictionary - self.input_quant = type( - "InputQuant", - (), - { - "type": "int", - "num_bits": 8, 
- "strategy": "token", - "group_size": 128, - "symmetric": True, - "dynamic": True, - "actorder": "none", - "observer": None, - "observer_kwargs": {}, - "block_structure": None, - }, - )() - - # Change comparison method to directly compare strings + # 修改比较方式,直接比较字符串 per_channel = ( self.weight_quant.strategy == "channel" - and self.input_quant.strategy == "token" - ) + and self.input_quant.strategy == "token") if not per_channel: raise ValueError( "For INT8 Fused MoE layers, we require channelwise, " "dynamic per token quantization. Found " - f"{self.weight_quant}, {self.input_quant}" - ) + f"{self.weight_quant}, {self.input_quant}") self.static_input_scales = not self.input_quant.dynamic if self.static_input_scales: raise ValueError( "For INT8 Fused MoE layers, we require channelwise, " - "dynamic per token quantization. Found static input scales." - ) + "dynamic per token quantization. Found static input scales.") - def create_weights1( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - # Use float32 as a placeholder for weights to facilitate loading original weights from ckpt - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=params_dtype, - ), # generally is torch.bfloat16 - requires_grad=False, - ) + def create_weights1(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): + # 权重先用浮点占位,便于从 ckpt 加载原始权重 + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), # 通常是 torch.bfloat16 + requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - 
intermediate_size_per_partition, - dtype=params_dtype, - ), - requires_grad=False, - ) + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - # Channel scale: float32 + 2D [E, out] (aligned with fused_moe/UT) + # 通道 scale:float32 + 二维 [E, out](与 fused_moe/UT 对齐) w13_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, dtype=torch.float32 - ), - requires_grad=False, - ) + torch.empty(num_experts, 2 * intermediate_size_per_partition, dtype=torch.float32), + requires_grad=False) w2_weight_scale = torch.nn.Parameter( torch.empty(num_experts, hidden_size, dtype=torch.float32), - requires_grad=False, - ) + requires_grad=False) layer.register_parameter("w13_weight_scale", w13_weight_scale) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Input scale can be dynamically calculated + # 输入 scale 动态计算即可 layer.w13_input_scale = None layer.w2_input_scale = None - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=torch.int8, - ), # directly use int8 - requires_grad=False, - ) + def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=torch.int8), # 直接使用 int8 + requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter( - 
torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition, - dtype=torch.int8, - ), # directly use int8 - requires_grad=False, - ) + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=torch.int8), # 直接使用 int8 + requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - # Scale factors + # 缩放因子 w13_weight_scale = torch.nn.Parameter( - torch.empty( - num_experts, 2 * intermediate_size_per_partition, dtype=torch.float32 - ), - requires_grad=False, - ) + torch.empty(num_experts, 2 * intermediate_size_per_partition, dtype=torch.float32), + requires_grad=False) w2_weight_scale = torch.nn.Parameter( torch.empty(num_experts, hidden_size, dtype=torch.float32), - requires_grad=False, - ) + requires_grad=False) layer.register_parameter("w13_weight_scale", w13_weight_scale) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Input scale can be dynamically calculated + # 输入 scale 动态计算 layer.w13_input_scale = None layer.w2_input_scale = None - + @torch.no_grad() def process_weights_after_loading(self, layer: torch.nn.Module) -> None: return - # Convert original weights to float32 for more robust statistics + #原始权重转 float32 做统计更稳健 w13_f = layer.w13_weight.float() - w2_f = layer.w2_weight.float() + w2_f = layer.w2_weight.float() - # Each column (abs_max) -> per-column scale (out dimension is dim=1, column is dim=-1) + # 每列(abs_max) -> per-column scale(out 维在 dim=1,列在 dim=-1) qmax = 127.0 w13_abs_max = torch.amax(torch.abs(w13_f), dim=-1) # [E, 2N] - w2_abs_max = torch.amax(torch.abs(w2_f), dim=-1) # [E, H] + w2_abs_max = torch.amax(torch.abs(w2_f), dim=-1) # [E, H] w13_scale_2d = torch.clamp(w13_abs_max, min=1e-6) / qmax # [E, 2N], float32 - w2_scale_2d = torch.clamp(w2_abs_max, min=1e-6) / qmax # [E, H], float32 + w2_scale_2d = torch.clamp(w2_abs_max, min=1e-6) / qmax # [E, H], float32 - # Quantization: broadcast 3D scale 
and store back to 2D scale + # 量化:用 3D scale 广播,存回 2D scale w13_scale_3d = w13_scale_2d.unsqueeze(-1) # [E, 2N, 1] - w2_scale_3d = w2_scale_2d.unsqueeze(-1) # [E, H, 1] + w2_scale_3d = w2_scale_2d.unsqueeze(-1) # [E, H, 1] w13_q = torch.round(w13_f / w13_scale_3d).clamp_(-128, 127).to(torch.int8) - w2_q = torch.round(w2_f / w2_scale_3d).clamp_(-128, 127).to(torch.int8) + w2_q = torch.round(w2_f / w2_scale_3d ).clamp_(-128, 127).to(torch.int8) - # Optional: If your fused/kernel expects scale pre-multiplied by 127 (to be consistent with some UT backends), uncomment the following two lines: + # 可选:若你的 fused/kernel 期望 scale 预乘 127(与某些 UT 后端一致),打开下面两行: w13_scale_2d = w13_scale_2d * 127.0 - w2_scale_2d = w2_scale_2d * 127.0 + w2_scale_2d = w2_scale_2d * 127.0 - # Write back parameters: weight int8; scale uses float32 + 2D - replace_parameter( - layer, "w13_weight", torch.nn.Parameter(w13_q, requires_grad=False) - ) - replace_parameter( - layer, "w2_weight", torch.nn.Parameter(w2_q, requires_grad=False) - ) - replace_parameter( - layer, - "w13_weight_scale", - torch.nn.Parameter(w13_scale_2d.contiguous(), requires_grad=False), - ) - replace_parameter( - layer, - "w2_weight_scale", - torch.nn.Parameter(w2_scale_2d.contiguous(), requires_grad=False), - ) - - # Brief check - print( - f"w13: {w13_q.shape}, w13_s: {w13_scale_2d.shape}, w2: {w2_q.shape}, w2_s: {w2_scale_2d.shape}" - ) + # 回写参数:权重 int8;scale 用 float32 + 2D + replace_parameter(layer, 'w13_weight', torch.nn.Parameter(w13_q, requires_grad=False)) + replace_parameter(layer, 'w2_weight', torch.nn.Parameter(w2_q, requires_grad=False)) + replace_parameter(layer, 'w13_weight_scale', + torch.nn.Parameter(w13_scale_2d.contiguous(), requires_grad=False)) + replace_parameter(layer, 'w2_weight_scale', + torch.nn.Parameter(w2_scale_2d.contiguous(), requires_grad=False)) + # 简要检查 + print(f"w13: {w13_q.shape}, w13_s: {w13_scale_2d.shape}, w2: {w2_q.shape}, w2_s: {w2_scale_2d.shape}") + def apply( self, layer: torch.nn.Module, 
@@ -300,11 +214,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", - enable_eplb: bool = False, # Add this parameter - expert_load_view: Optional[torch.Tensor] = None, # Add this parameter - logical_to_physical_map: Optional[torch.Tensor] = None, # Add this parameter - logical_replica_count: Optional[torch.Tensor] = None, # Add this parameter - linear_weights: Optional[torch.Tensor] = None, # Add this parameter + enable_eplb: bool = False, # 添加这个参数 + expert_load_view: Optional[torch.Tensor] = None, # 添加这个参数 + logical_to_physical_map: Optional[torch.Tensor] = None, # 添加这个参数 + logical_replica_count: Optional[torch.Tensor] = None, # 添加这个参数 + linear_weights: Optional[torch.Tensor] = None, # 添加这个参数 ) -> torch.Tensor: output = torch.empty_like(x) @@ -326,8 +240,5 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): ) return output - -print( - "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsMoEMethod \ - --> vllm_xpu.model_executor.layers.quantization.compressed_tensors_moe.py:CompressedTensorsMoEMethod" -) +print("[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsMoEMethod \ + --> vllm_xpu.model_executor.layers.quantization.compressed_tensors_moe.py:CompressedTensorsMoEMethod") \ No newline at end of file diff --git a/vllm_kunlun/ops/quantization/gptq.py b/vllm_kunlun/ops/quantization/gptq.py deleted file mode 100644 index e7fdba7..0000000 --- a/vllm_kunlun/ops/quantization/gptq.py +++ /dev/null @@ -1,108 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Author: Li Wei, You Zeyu -# Email: liwei157@baidu.com, youzeyu@baidu.com -# This file is a part of the vllm-kunlun project. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from torch.nn.parameter import Parameter -from typing import Optional -from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod, ExllamaState - - -def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # for torch.compile - layer.qzeros = Parameter( - self.repack_int4_for_kunlun(layer.qzeros.data, self.quant_config.weight_bits) - if self.quant_config.weight_bits == 4 else layer.qzeros.data, - requires_grad=False - ) - layer.qweight = Parameter(layer.qweight.data, requires_grad=False) - layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False) - layer.scales = Parameter(layer.scales.data, requires_grad=False) - - # exllama needs to shuffle the weight after the weight is loaded - # here we do the shuffle on first forward pass - if layer.exllama_state == ExllamaState.UNINITIALIZED: - if self.quant_config.desc_act: - layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int) - else: - layer.g_idx.data = torch.empty((0, ), - dtype=torch.int, - device=layer.g_idx.device) - layer.exllama_state = ExllamaState.READY - - # No need shuffle on xpu - # ops.gptq_shuffle(layer.qweight, layer.g_idx, - # self.quant_config.weight_bits) - - -def repack_int4_for_kunlun(self, packed: torch.Tensor, num_bits: int = 4): - N, K = packed.shape - assert num_bits == 4, "Only int4 supported now" - shifts = torch.arange(0, 32, num_bits, device=packed.device, dtype=torch.int32) - - # Unpack 
int32 to int4 values - unpacked_gptq = ( - packed.view(N, K // 8, 8).unsqueeze(-1) >> shifts - ) & 0xF # [N, K//8, 8, 8] - - # Convert to KUNLUN order - GPTQ_TO_KUNLUN_ORDER_FAST = [ - 32, 0, 33, 1, 34, 2, 35, 3, - 36, 4, 37, 5, 38, 6, 39, 7, - 40, 8, 41, 9, 42, 10, 43, 11, - 44, 12, 45, 13, 46, 14, 47, 15, - 48, 16, 49, 17, 50, 18, 51, 19, - 52, 20, 53, 21, 54, 22, 55, 23, - 56, 24, 57, 25, 58, 26, 59, 27, - 60, 28, 61, 29, 62, 30, 63, 31, - ] - unpacked_gptq = unpacked_gptq.reshape(N, K // 8, 64) - unpacked_kunlun = unpacked_gptq[..., GPTQ_TO_KUNLUN_ORDER_FAST] # [N, K//8, 64] - - # Pack to int32 - unpacked_kunlun = unpacked_kunlun.reshape(N, K // 8, 8, 8) - packed_kunlun = ( - (unpacked_kunlun << shifts).sum(dim=-1, dtype=torch.int32).reshape(N, K) - ) # [N, K] - - return packed_kunlun - - -def apply( - self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None -) -> torch.Tensor: - out_shape = x.shape[:-1] + (layer.qweight.shape[-1], ) - reshaped_x = x.reshape(-1, x.shape[-1]) - - output = torch.ops.xspeedgate_ops.gptq_gemm( - reshaped_x, - layer.qweight, - layer.qzeros, - layer.scales, - layer.g_idx, - layer.exllama_state == ExllamaState.READY, - self.quant_config.weight_bits, - ) - if bias is not None: - output.add_(bias) - return output.reshape(out_shape) - - -GPTQLinearMethod.repack_int4_for_kunlun = repack_int4_for_kunlun -GPTQLinearMethod.process_weights_after_loading = process_weights_after_loading -GPTQLinearMethod.apply = apply diff --git a/vllm_kunlun/ops/rotary_embedding.py b/vllm_kunlun/ops/rotary_embedding.py index 3b5b813..2d65f3d 100644 --- a/vllm_kunlun/ops/rotary_embedding.py +++ b/vllm_kunlun/ops/rotary_embedding.py @@ -12,35 +12,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# This file is a part of the vllm-kunlun project. +# This file is a part of the vllm-ascend project. 
# import torch import xspeedgate_ops import os from vllm.model_executor.layers.rotary_embedding import ( - RotaryEmbedding, - YaRNScalingRotaryEmbedding, - DynamicNTKScalingRotaryEmbedding, - MRotaryEmbedding, -) + RotaryEmbedding, YaRNScalingRotaryEmbedding, DynamicNTKScalingRotaryEmbedding, MRotaryEmbedding) from typing import Optional, Tuple -import xtorch_ops - def vllm_kunlun_compute_cos_sin_cache(self) -> torch.Tensor: """Compute the cos and sin cache.""" inv_freq = self._compute_inv_freq(self.base) - if hasattr(self, "scaling_factor"): - self.max_position_embeddings = int( - self.max_position_embeddings * self.scaling_factor - ) + if hasattr(self, 'scaling_factor'): + self.max_position_embeddings = int(self.max_position_embeddings * self.scaling_factor) t = torch.arange(self.max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) cos = freqs.cos() sin = freqs.sin() - if os.getenv("FUSED_QK_ROPE_OP") == "1": + #对于glm4-9b-chat,rope跑forward_native,所以需要cache保持特定的形状,这里通过环境变量控制 + #对于qwen2.5-vl,rope跑mrope,也需要cache保持特定的形状 + #也就是说跑glm4-9b-chat、qwen2.5-vl,需要设置GLM4_CHAT环境变量为1 + if os.getenv('ROPE_NATIVE_2D') == "1": + cache = torch.cat((cos, sin), dim=-1) + return cache + if os.getenv('USE_ORI_ROPE') == "0": cache_cos = torch.cat((cos, cos), dim=-1) cache_sin = torch.cat((sin, sin), dim=-1) # [2, self.max_position_embeddings, self.rotary_dim * 2] @@ -51,89 +49,108 @@ def vllm_kunlun_compute_cos_sin_cache(self) -> torch.Tensor: def vllm_kunlun_forward_cuda( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - """forward_cuda""" - from vllm_kunlun.ops._kunlun_ops import KunlunOps as ops + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """forward_cuda""" + from 
vllm_kunlun.ops._kunlun_ops import KunlunOps as ops - if ( - self.cos_sin_cache.device != query.device - or self.cos_sin_cache.dtype != query.dtype - ): - self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. - if offsets is not None: - ops.batched_rotary_embedding( - positions, - query, - key, - self.head_size, - self.cos_sin_cache, - self.is_neox_style, - self.rotary_dim, - offsets, - ) + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + +def apply_interleaved_rope(x: torch.Tensor, + mrope_section: list[int]) -> torch.Tensor: + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. + """ + x_t = x[0].clone() + x_t[..., 1:mrope_section[1] * 3:3] = x[1, ..., 1:mrope_section[1] * 3:3] + x_t[..., 2:mrope_section[2] * 3:3] = x[2, ..., 2:mrope_section[2] * 3:3] + return x_t + +def vllm_kunlun_apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, + is_neox_style: bool) -> torch.Tensor: + """ + Args: + x: [num_tokens, num_heads, head_size] + cos: [num_tokens, head_size // 2] + sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. 
+ """ + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) else: - query, key = ops.rotary_embedding( - positions, - query, - key, - self.head_size, - self.cos_sin_cache, - self.is_neox_style, - ) - return query, key - + x1 = x[..., ::2] + x2 = x[..., 1::2] + o1 = x1 * cos - x2 * sin + o2 = x2 * cos + x1 * sin + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) def vllm_kunlun_mrope_forward_cuda( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - """PyTorch-native implementation equivalent to forward(). + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """PyTorch-native implementation equivalent to forward(). - Args: - positions: - [num_tokens,] (text only) or - [3, num_tokens] (T/H/W positions with multimodal inputs) - query: [num_tokens, num_heads * head_size] - key: [num_tokens, num_kv_heads * head_size] - """ + Args: + positions: + [num_tokens,] (text only) or + [3, num_tokens] (T/H/W positions with multimodal inputs) + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + """ + assert positions.ndim == 2 + assert key is not None + + query, key = torch.ops.xspeedgate_ops.mrotary_embedding_fwd_v0( + query, + key, + positions.to(dtype=torch.int32), + self.cos_sin_cache, + self.mrope_interleaved, + self.is_neox_style, + self.head_size, + self.rotary_dim, + self.mrope_section[0], + self.mrope_section[1], + self.mrope_section[2] + ) - assert positions.ndim == 2 - assert key is not None + return query, key - query, key = torch.ops.xspeedgate_ops.mrotary_embedding_fwd_v0( - query, - key, - positions.to(dtype=torch.int32), - self.cos_sin_cache, - False, # self.mrope_interleaved, - self.head_size, 
- self.rotary_dim, - self.mrope_section[0], - self.mrope_section[1], - self.mrope_section[2], - ) - - return query, key - - -RotaryEmbedding.forward_cuda = vllm_kunlun_forward_cuda -RotaryEmbedding.forward = vllm_kunlun_forward_cuda -if os.getenv("KUNLUN_ENABLE_MULTI_LORA") == "1" or os.getenv("FUSED_QK_ROPE_OP") == "1": - RotaryEmbedding._compute_cos_sin_cache = vllm_kunlun_compute_cos_sin_cache -else: - pass +# RotaryEmbedding.forward_cuda = vllm_kunlun_forward_cuda +# RotaryEmbedding.forward = vllm_kunlun_forward_cuda +# RotaryEmbedding._compute_cos_sin_cache = vllm_kunlun_compute_cos_sin_cache MRotaryEmbedding.forward_cuda = vllm_kunlun_mrope_forward_cuda MRotaryEmbedding.forward = vllm_kunlun_mrope_forward_cuda +# MRotaryEmbedding._compute_cos_sin_cache = vllm_kunlun_compute_cos_sin_cache YaRNScalingRotaryEmbedding._compute_inv_freq = RotaryEmbedding._compute_inv_freq +# YaRNScalingRotaryEmbedding._compute_cos_sin_cache = vllm_kunlun_compute_cos_sin_cache def Split_Norm_Rope( @@ -145,36 +162,27 @@ def Split_Norm_Rope( max_position_embeddings: int, q_head_num: int, kv_head_num: int, - head_dim: int, - partial_rotary_factor: float = 1.0, + head_dim:int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: num_tokens = qkv.shape[0] - rotary_dim = head_dim - if partial_rotary_factor < 1.0: - rotary_dim = int(rotary_dim * partial_rotary_factor) - q_emb_out = torch.empty( - (num_tokens, q_head_num * head_dim), dtype=qkv.dtype, device=qkv.device - ) - k_emb_out = torch.empty( - (num_tokens, kv_head_num * head_dim), dtype=qkv.dtype, device=qkv.device - ) - v_out = torch.empty( - (num_tokens, kv_head_num * head_dim), dtype=qkv.dtype, device=qkv.device - ) + rotary_dim=head_dim + q_emb_out = torch.empty((num_tokens, q_head_num * head_dim), dtype=qkv.dtype, device=qkv.device) + k_emb_out = torch.empty((num_tokens, kv_head_num * head_dim), dtype=qkv.dtype, device=qkv.device) + v_out = torch.empty((num_tokens, kv_head_num * head_dim), dtype=qkv.dtype, device=qkv.device) 
torch.ops._C.split_norm_rope_neox( - q_emb_out, - k_emb_out, - v_out, - qkv, - cos_sin_cache, - q_norm_weight, - k_norm_weight, - positions, - num_tokens, - max_position_embeddings, - q_head_num, - kv_head_num, - head_dim, - rotary_dim, - ) - return q_emb_out, k_emb_out, v_out + q_emb_out, + k_emb_out, + v_out, + qkv, + cos_sin_cache, + q_norm_weight, + k_norm_weight, + positions, + num_tokens, + max_position_embeddings, + q_head_num, + kv_head_num, + head_dim, + rotary_dim, + ) + return q_emb_out, k_emb_out, v_out diff --git a/vllm_kunlun/ops/sample/sampler.py b/vllm_kunlun/ops/sample/sampler.py index 7bada08..a6c9e4e 100644 --- a/vllm_kunlun/ops/sample/sampler.py +++ b/vllm_kunlun/ops/sample/sampler.py @@ -14,25 +14,20 @@ import torch.nn as nn import vllm.envs as envs from vllm.model_executor.layers.utils import apply_penalties -from vllm.model_executor.sampling_metadata import ( - SamplingMetadata, - SamplingTensors, - SequenceGroupToSample, -) +from vllm.model_executor.sampling_metadata import (SamplingMetadata, + SamplingTensors, + SequenceGroupToSample) from vllm.sampling_params import SamplingType -from vllm.sequence import ( - VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, - Logprob, - PromptLogprobs, - SampleLogprobs, - SequenceOutput, -) +from vllm.sequence import (VLLM_INVALID_TOKEN_ID, + CompletionSequenceGroupOutput, Logprob, + PromptLogprobs, SampleLogprobs, SequenceOutput) if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable from flashinfer.sampling import ( - top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) # yapf: enable + top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) + + # yapf: enable else: flashinfer_top_k_top_p_sampling = None @@ -43,16 +38,15 @@ logger = init_logger(__name__) def get_sampler() -> torch.nn.Module: """ - Get a model for sampling, returning a type of torch.nn.Module. 
- If the environment variable VLLM_USE_V1 is set to True, the v1 version of the sampler is used; otherwise, the current version of the sampler is used. - + 获取一个用于采样的模型,返回类型为torch.nn.Module。 + 如果环境变量VLLM_USE_V1设置为True,则使用v1版本的采样器;否则使用当前版本的采样器。 + Returns: - torch.nn.Module (Union[Sampler, V1Sampler]): A model for sampling, which can be of type Sampler or V1Sampler. + torch.nn.Module (Union[Sampler, V1Sampler]): 一个用于采样的模型,可以是Sampler或V1Sampler类型。 """ if envs.VLLM_USE_V1: # Lazy import: the v1 package isn't distributed from vllm.v1.sample.sampler import Sampler as V1Sampler - return V1Sampler() return Sampler() @@ -62,7 +56,8 @@ SampleResultType = list[tuple[list[int], list[int]]] # Types of temporary data structures used for # computing sample_result -SampleMetadataType = dict[SamplingType, tuple[list[int], list[SequenceGroupToSample]]] +SampleMetadataType = dict[SamplingType, tuple[list[int], + list[SequenceGroupToSample]]] MultinomialSamplesType = dict[SamplingType, torch.Tensor] SampleResultsDictType = dict[int, tuple[list[int], list[int]]] @@ -95,8 +90,9 @@ SampleReturnType = tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] class SamplerOutput( - msgspec.Struct, omit_defaults=True, array_like=True # type: ignore[call-arg] -): # type: ignore[call-arg] + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """For each sequence group, we generate a list of SequenceOutput object, each of which contains one possible candidate for the next token. @@ -146,42 +142,43 @@ class SamplerOutput( def __getitem__(self, idx: int) -> CompletionSequenceGroupOutput: """ Returns the output at index `idx` in the sequence group. - + Args: idx (int): Index of the output to retrieve. Must be a valid integer within the range [0, len(self)). - + Raises: IndexError: If `idx` is not a valid integer or is out of bounds. - + Returns: CompletionSequenceGroupOutput: The output at index `idx`. 
""" return self.outputs[idx] def __setitem__(self, idx: int, value): - """Set the value at the specified index. - If the index is out of range, an IndexError will be raised. - + """ + 设置指定索引处的值。 + 如果索引超出范围,将抛出IndexError异常。 + Args: - idx (int): The index of the output to set. - value (Any): The value to set. - + idx (int): 要设置的索引值。 + value (Any): 要设置的值。 + Raises: - IndexError: If `idx` is out of range. - + IndexError: 当idx超出范围时抛出该异常。 + Returns: - None; + None; 无返回值。 """ self.outputs[idx] = value def __iter__(self) -> Iterator[CompletionSequenceGroupOutput]: """ Returns an iterator over the outputs of this group. - + Args: None - + Returns: Iterator[CompletionSequenceGroupOutput]: An iterator that yields each output in order. """ @@ -190,39 +187,36 @@ class SamplerOutput( def __len__(self): """ Returns the length of the output list. - + Returns: int: The length of the output list. """ return len(self.outputs) def __eq__(self, other: object): - """Check if the current object is equal to another object. - Two objects are considered equal if: 1) they are of the same type; 2) they have the same outputs. - - Args: - other (object): The other object to compare. - - Returns: - bool: If the two objects are equal, return True; otherwise, return False. 
""" - return isinstance(other, self.__class__) and self.outputs == other.outputs + 判断当前对象是否等于另一个对象。 + 两个对象相等的条件是:1)都是同类型;2)都有相同的 outputs。 + + Args: + other (object): 需要比较的另一个对象。 + + Returns: + bool: 如果两个对象相等,返回 True;否则返回 False。 + """ + return isinstance(other, + self.__class__) and self.outputs == other.outputs def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise.""" - sampled_token_probs_repr = ( - "None" - if self.sampled_token_probs is None - else self.sampled_token_probs.shape - ) - sampled_token_ids_repr = ( - "None" if self.sampled_token_ids is None else self.sampled_token_ids.shape - ) - return ( - f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr})" - ) + """Show the shape of a tensor instead of its values to reduce noise. + """ + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None + else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else + self.sampled_token_ids.shape) + return (f"SamplerOutput(outputs={self.outputs}, " + f"sampled_token_probs={sampled_token_probs_repr}, " + f"sampled_token_ids={sampled_token_ids_repr})") class Sampler(nn.Module): @@ -249,13 +243,13 @@ class Sampler(nn.Module): def __init__(self): """ Initializes a SamplerOutput object. - + Args: None. - + Returns: None. - + Raises: None. 
""" @@ -284,11 +278,9 @@ class Sampler(nn.Module): self._sampling_tensors = None # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) = ( - SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype - ) - ) + (sampling_tensors, do_penalties, do_top_p_top_k, + do_min_p) = SamplingTensors.from_sampling_metadata( + sampling_metadata, vocab_size, logits.device, logits.dtype) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties @@ -343,14 +335,11 @@ class Sampler(nn.Module): # Apply presence and frequency penalties. if do_penalties: - logits = apply_penalties( - logits, - sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties, - ) + logits = apply_penalties(logits, sampling_tensors.prompt_tokens, + sampling_tensors.output_tokens, + sampling_tensors.presence_penalties, + sampling_tensors.frequency_penalties, + sampling_tensors.repetition_penalties) # Use float32 to apply temperature scaling. # Use in-place division to avoid creating a new tensor. @@ -359,9 +348,8 @@ class Sampler(nn.Module): logits_idx = None if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits, logits_idx = _apply_top_k_top_p( - logits, sampling_tensors.top_ps, sampling_tensors.top_ks - ) + logits, logits_idx = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -399,10 +387,10 @@ class Sampler(nn.Module): sample_logprobs = None if not sampling_metadata.skip_sampler_cpu_output: # Pythonize logprobs now (GPU -> CPU); do not defer. 
- assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) prompt_logprobs, sample_logprobs = get_logprobs( - logprobs, sampling_metadata, maybe_deferred_sample_results - ) + logprobs, sampling_metadata, maybe_deferred_sample_results) return _build_sampler_output( maybe_deferred_sample_results, @@ -410,8 +398,7 @@ class Sampler(nn.Module): prompt_logprobs, sample_logprobs, on_device_tensors=on_device_tensors, - skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output, - ) + skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) @property def _should_modify_greedy_probs_inplace(self) -> bool: @@ -433,7 +420,7 @@ def _apply_min_tokens_penalty( sampling_metadata: SamplingMetadata, ) -> torch.Tensor: """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens - have not been generated yet + have not been generated yet """ # list of indices in logits that will be set to -inf logits_to_penalize: list[tuple[int, int]] = [] @@ -443,7 +430,8 @@ def _apply_min_tokens_penalty( sampling_params = seq_group.sampling_params sample_indices = seq_group.sample_indices - logits_applied += len(sample_indices) + len(seq_group.prompt_logprob_indices) + logits_applied += len(sample_indices) + len( + seq_group.prompt_logprob_indices) if not seq_group.do_sample: continue @@ -462,8 +450,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize = [start_idx + j for j in seqs_to_penalize] # itertools.product pairs each seq index with every token id logits_to_penalize.extend( - itertools.product(seqs_to_penalize, token_ids_to_penalize) - ) + itertools.product(seqs_to_penalize, token_ids_to_penalize)) if logits_to_penalize: # use zip and * to group indices along each dimension @@ -482,19 +469,19 @@ def _apply_top_k_top_p( ) -> torch.Tensor: """ Applies both top-k and top-p to the given logits. 
- + Args: logits (torch.Tensor): The input logits of shape [batch_size, sequence_length, num_classes]. p (torch.Tensor): The tensor containing the probability threshold for applying top-p. Must be between 0 and 1. k (torch.Tensor): The tensor containing the number of top elements to keep for applying top-k. Must be less than or equal to num_classes. - + Returns: tuple (torch.Tensor, torch.Tensor): - The modified logits with top-k and top-p applied. Shape is [batch_size, sequence_length, num_classes]. - The indices of the sorted logits before applying top-k and top-p. Shape is [batch_size, sequence_length]. - + Raises: ValueError: If p is not between 0 and 1. ValueError: If k is greater than num_classes. @@ -516,6 +503,10 @@ def _apply_top_k_top_p( top_p_mask[:, -1] = False logits_sort.masked_fill_(top_p_mask, -float("inf")) + # Re-sort the probabilities. + # logits = torch.empty_like(logits_sort).scatter_(dim=-1, + # index=logits_idx, + # src=logits_sort) return logits_sort, logits_idx @@ -562,7 +553,8 @@ def _greedy_sample( seq_ids = seq_group.seq_ids num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, "Greedy sampling should have only one seq." + assert num_parent_seqs == 1, ( + "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) @@ -602,13 +594,13 @@ def _random_sample( if is_prompt: # Prompt phase. parent_ids = [0] * sampling_params.n - next_token_ids = random_samples[sample_idx, : sampling_params.n].tolist() + next_token_ids = random_samples[ + sample_idx, :sampling_params.n].tolist() else: # Generation phase. 
parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[ - sample_idx : sample_idx + num_parent_seqs, 0 - ].tolist() + next_token_ids = random_samples[sample_idx:sample_idx + + num_parent_seqs, 0].tolist() results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -628,18 +620,18 @@ def _multinomial( Samples from a multinomial distribution. If `num_samples` is greater than one, the input tensor will be repeated along the first dimension to match the number of samples. The output will have shape (batch size, num samples), where batch size is the length of the input tensor. - + Args: probs (torch.Tensor): A tensor containing the probabilities of each class. Should be broadcastable with the other arguments. num_samples (int): The number of samples to draw from the multinomial distribution. seq_groups (Optional[list[SequenceGroupToSample]], optional): A list of sequence groups that specify how to sample from each group. Defaults to None. - + Returns: torch.Tensor: A tensor containing the indices of the classes sampled from the multinomial distribution. - + Raises: ValueError: If the input tensor has less than two dimensions or if `num_samples` is less than one. 
@@ -657,37 +649,32 @@ def _multinomial( seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - q[sample_idx : sample_idx + stride].exponential_( - generator=seq_group.generator - ) + q[sample_idx:sample_idx + + stride].exponential_(generator=seq_group.generator) sample_idx += stride return probs.div_(q).argmax(dim=1).view(-1, num_samples) def _top_k_top_p_multinomial_with_flashinfer( - probs: torch.Tensor, - top_ks: torch.Tensor, - top_ps: torch.Tensor, - num_samples: int, - seq_groups: Optional[list[SequenceGroupToSample]], -): + probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, + num_samples: int, seq_groups: Optional[list[SequenceGroupToSample]]): """ - Use FlashInfer to implement Top-K and Top-P sampling for multiple samples. - - If `seq_groups` is not None, sampling will be performed for each sequence group and the results will be concatenated. - + 使用 FlashInfer 实现多个样本的 Top-K 和 Top-P 采样。 + + 如果 `seq_groups` 不为空,则会对每个序列组进行采样,并将结果拼接起来。 + Args: - probs (torch.Tensor): The input probability distribution, with shape `(batch_size, vocab_size)`. - top_ks (torch.Tensor): The Top-K values for each sample, with shape `(batch_size,)`. - top_ps (torch.Tensor): The Top-P values for each sample, with shape `(batch_size,)`. - num_samples (int): The number of samples to generate. - seq_groups (Optional[list[SequenceGroupToSample]]): Optional, a list containing sequence group information, defaults to None. - SequenceGroupToSample is a class that contains relevant information about a sequence group, including seq_ids and generator (optional). - If seq_groups is not None, sampling will be performed for each sequence group and the results will be concatenated. 
- + probs (torch.Tensor): 输入概率分布,形状为 `(batch_size, vocab_size)`。 + top_ks (torch.Tensor): 每个样本的 Top-K 值,形状为 `(batch_size,)`。 + top_ps (torch.Tensor): 每个样本的 Top-P 值,形状为 `(batch_size,)`。 + num_samples (int): 生成的样本数量。 + seq_groups (Optional[list[SequenceGroupToSample]]): 可选,包含序列组信息的列表,默认为 None。 + SequenceGroupToSample 是一个类,包含了一个序列组的相关信息,包括 seq_ids、generator(可选)。 + 如果 seq_groups 不为 None,则会对每个序列组进行采样,并将结果拼接起来。 + Returns: - torch.Tensor: A tensor containing the next token IDs, with shape `(batch_size * num_samples,)`. - + torch.Tensor: 返回形状为 `(batch_size * num_samples,)` 的下一个 token ID 的张量。 + Raises: None """ @@ -697,7 +684,8 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ks = top_ks.repeat_interleave(num_samples) top_ps = top_ps.repeat_interleave(num_samples) batch_size = probs.shape[0] - uniform_samples = torch.empty((max_top_k_round, batch_size), device=probs.device) + uniform_samples = torch.empty((max_top_k_round, batch_size), + device=probs.device) if seq_groups is None: uniform_samples.uniform_() else: @@ -706,9 +694,8 @@ def _top_k_top_p_multinomial_with_flashinfer( seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - uniform_samples[:, sample_idx : sample_idx + stride].uniform_( - generator=seq_group.generator - ) + uniform_samples[:, sample_idx:sample_idx + + stride].uniform_(generator=seq_group.generator) sample_idx += stride batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, @@ -717,19 +704,18 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("FlashInfer rejection sampling failed, fallback.", stacklevel=1) + warnings.warn("FlashInfer rejection sampling failed, fallback.", + stacklevel=1) probs = flashinfer.sampling.top_k_renorm_prob(probs, top_ks) probs = flashinfer.sampling.top_p_renorm_prob(probs, top_ps) batch_next_token_ids = flashinfer.sampling.sampling_from_probs( - probs, uniform_samples[0] - ) + probs, uniform_samples[0]) return 
batch_next_token_ids.view(-1, num_samples) def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType, -) -> SampleResultType: - """This function consumes GPU-side sampler results and computes + sample_result_args: SampleResultArgsType) -> SampleResultType: + '''This function consumes GPU-side sampler results and computes Pythonized CPU-side sampler results (GPU -> CPU sync.) Single-step scheduling: this function is invoked at sampling-time @@ -743,7 +729,7 @@ def get_pythonized_sample_results( Returns: Pythonized sampler results - """ + ''' ( sample_metadata, @@ -766,9 +752,8 @@ def get_pythonized_sample_results( if sampling_type == SamplingType.GREEDY: sample_results = _greedy_sample(seq_groups, greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample( - seq_groups, multinomial_samples[sampling_type] - ) + sample_results = _random_sample(seq_groups, + multinomial_samples[sampling_type]) sample_results_dict.update(zip(seq_group_id, sample_results)) return [ @@ -786,7 +771,7 @@ def _sample_with_torch( include_gpu_probs_tensor: bool, modify_greedy_probs: bool, ) -> SampleReturnType: - """Torch-oriented _sample() implementation. + '''Torch-oriented _sample() implementation. Single-step scheduling: * Perform GPU-side sampling computation @@ -796,10 +781,11 @@ def _sample_with_torch( * Perform GPU-side sampling computation * Defer Pythonization & preserve GPU-side tensors required for Pythonization - """ + ''' categorized_seq_group_ids: dict[SamplingType, list[int]] = { - t: [] for t in SamplingType + t: [] + for t in SamplingType } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): @@ -814,12 +800,10 @@ def _sample_with_torch( # Create output tensor for sampled token ids. 
if include_gpu_probs_tensor: - sampled_token_ids_tensor = torch.full( - (logprobs.shape[0], 1), - VLLM_INVALID_TOKEN_ID, - dtype=torch.long, - device=logprobs.device, - ) + sampled_token_ids_tensor = torch.full((logprobs.shape[0], 1), + VLLM_INVALID_TOKEN_ID, + dtype=torch.long, + device=logprobs.device) else: sampled_token_ids_tensor = None @@ -836,21 +820,21 @@ def _sample_with_torch( sample_metadata[sampling_type] = (seq_group_id, seq_groups) long_sample_indices = sample_indices.long() if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) + greedy_samples = torch.argmax(logprobs[long_sample_indices], + dim=-1) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = ( - greedy_samples.unsqueeze(-1) - ) + sampled_token_ids_tensor[ + long_sample_indices] = greedy_samples.unsqueeze(-1) if modify_greedy_probs: # If required, modify the probabilities such that sampling from # the modified distribution would always sample the argmax # token id. 
- _modify_greedy_probs_inplace( - logprobs, probs, long_sample_indices, greedy_samples - ) + _modify_greedy_probs_inplace(logprobs, probs, + long_sample_indices, + greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_n_in_batch = 1 @@ -858,39 +842,34 @@ def _sample_with_torch( if seq_group.is_prompt: sampling_params = seq_group.sampling_params max_n_in_batch = max(max_n_in_batch, sampling_params.n) - seq_groups_arg = ( - None if sampling_type == SamplingType.RANDOM else seq_groups - ) + seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else + seq_groups) if flashinfer_top_k_top_p_sampling is not None: - multinomial_samples[sampling_type] = ( - _top_k_top_p_multinomial_with_flashinfer( + multinomial_samples[ + sampling_type] = _top_k_top_p_multinomial_with_flashinfer( probs[long_sample_indices], sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], max_n_in_batch, seq_groups_arg, ) - ) else: result_idx = _multinomial( probs[long_sample_indices], max_n_in_batch, - seq_groups=seq_groups_arg, - ) + seq_groups=seq_groups_arg) if logits_idx is not None: - token_ids = logits_idx[long_sample_indices].gather( - dim=1, index=result_idx.to(logits_idx.device) - ) + # multinomial_samples[sampling_type] = logits_idx[:, result_idx[:][0]] + token_ids = logits_idx[long_sample_indices].gather(dim=1, index=result_idx.to(logits_idx.device)) multinomial_samples[sampling_type] = token_ids else: multinomial_samples[sampling_type] = result_idx if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. 
- sampled_token_ids_tensor[long_sample_indices] = multinomial_samples[ - sampling_type - ].to(torch.long) + sampled_token_ids_tensor[long_sample_indices] = \ + multinomial_samples[sampling_type].to(torch.long) else: raise ValueError(f"Unsupported sampling type: {sampling_type}") @@ -902,17 +881,14 @@ def _sample_with_torch( sample_metadata=sample_metadata, multinomial_samples=multinomial_samples, greedy_samples=greedy_samples, - sample_results_dict=sample_results_dict, - ) + sample_results_dict=sample_results_dict) if not sampling_metadata.skip_sampler_cpu_output: # GPU<->CPU sync happens here. # This also converts the sampler output to a Python object. # Return Pythonized sampler result & sampled token ids - return ( - get_pythonized_sample_results(maybe_deferred_args), - sampled_token_ids_tensor, - ) + return get_pythonized_sample_results( + maybe_deferred_args), sampled_token_ids_tensor else: # Defer sampler result Pythonization; return deferred # Pythonization args & sampled token ids @@ -968,8 +944,9 @@ def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: Each element in the returned tensor represents the rank of the chosen token in the input logprob tensor. """ - vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), indices] - result = x > vals[:, None] + vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), + indices] + result = (x > vals[:, None]) del vals return result.sum(1).add_(1) @@ -1017,14 +994,15 @@ def get_logprobs( # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. - for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): + for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, + sample_results): sampling_params = seq_group.sampling_params # Update indices and tokens for prompt logprobs. 
- if seq_group.is_prompt and sampling_params.prompt_logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, sampling_params.prompt_logprobs - ) + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) next_token_ids.extend(next_prompt_tokens) @@ -1038,14 +1016,12 @@ def get_logprobs( # we can obtain it from `sample_result[1]`. query_idx = seq_group.sample_indices[0] query_indices.extend( - [query_idx + parent_id for parent_id in parent_seq_ids] - ) + [query_idx + parent_id for parent_id in parent_seq_ids]) next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, sampling_params.logprobs - ) + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.logprobs) assert len(next_token_ids) == len(query_indices) @@ -1053,9 +1029,8 @@ def get_logprobs( empty_sampled_logprob: SampleLogprobs = [] empty_prompt_logprob: Optional[PromptLogprobs] = None num_seq_groups = len(sampling_metadata.seq_groups) - return [empty_prompt_logprob] * num_seq_groups, [ - empty_sampled_logprob - ] * num_seq_groups + return [empty_prompt_logprob + ] * num_seq_groups, [empty_sampled_logprob] * num_seq_groups selected_logprobs, ranks = None, None top_logprobs, top_token_ids = None, None @@ -1064,16 +1039,15 @@ def get_logprobs( # skip the whole logprob calculation. if largest_num_logprobs >= 0: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) - next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) + next_token_ids_gpu = torch.tensor(next_token_ids, + device=logprobs.device) # (num_selected_query_tokens, num_logprobs). Note that query_indices can # contain duplicates if beam search is enabled. 
- selected_logprobs = logprobs[ - [ - query_indices_gpu, - next_token_ids_gpu, - ] - ] + selected_logprobs = logprobs[[ + query_indices_gpu, + next_token_ids_gpu, + ]] ranks = _get_ranks( logprobs[query_indices_gpu], next_token_ids_gpu, @@ -1084,14 +1058,14 @@ def get_logprobs( if largest_num_logprobs > 0: # Logprobs of topk tokens for a batch of sequence groups. # (num_query_tokens_across_batch). - top_logprobs, top_token_ids = torch.topk( - logprobs, largest_num_logprobs, dim=-1 - ) - top_logprobs = top_logprobs.to("cpu") - top_token_ids = top_token_ids.to("cpu") + top_logprobs, top_token_ids = torch.topk(logprobs, + largest_num_logprobs, + dim=-1) + top_logprobs = top_logprobs.to('cpu') + top_token_ids = top_token_ids.to('cpu') - selected_logprobs = selected_logprobs.to("cpu") - ranks = ranks.to("cpu") + selected_logprobs = selected_logprobs.to('cpu') + ranks = ranks.to('cpu') # Find prompt/sample logprobs. prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] @@ -1099,32 +1073,18 @@ def get_logprobs( top_logprob_idx = 0 selected_logprobs_idx = 0 - for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): - (prompt_logprobs, top_logprob_idx, selected_logprobs_idx) = ( - _get_prompt_logprob_if_needed( - seq_group, - selected_logprobs, - ranks, - top_token_ids, - top_logprobs, - selected_logprobs_idx, - top_logprob_idx, - ) - ) + for seq_group, sample_result in zip(sampling_metadata.seq_groups, + sample_results): + (prompt_logprobs, top_logprob_idx, + selected_logprobs_idx) = _get_prompt_logprob_if_needed( + seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, + selected_logprobs_idx, top_logprob_idx) prompt_logprobs_per_seq_group.append(prompt_logprobs) - (sampled_logprobs, top_logprob_idx, selected_logprobs_idx) = ( - _get_sampled_logprob_if_needed( - seq_group, - sample_result, - selected_logprobs, - ranks, - top_token_ids, - top_logprobs, - selected_logprobs_idx, - top_logprob_idx, - ) - ) + (sampled_logprobs, 
top_logprob_idx, + selected_logprobs_idx) = _get_sampled_logprob_if_needed( + seq_group, sample_result, selected_logprobs, ranks, top_token_ids, + top_logprobs, selected_logprobs_idx, top_logprob_idx) sample_logprobs_per_seq_group.append(sampled_logprobs) return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group @@ -1152,11 +1112,10 @@ def _get_prompt_logprob_if_needed( # Pre-select indexes and create a list. It is faster than calling .item # repetitively. selected_logprob_items = selected_logprobs[ - selected_logprobs_idx : selected_logprobs_idx + len(next_prompt_tokens) - ].tolist() - rank_items = ranks[ - selected_logprobs_idx : selected_logprobs_idx + len(next_prompt_tokens) - ].tolist() + selected_logprobs_idx:selected_logprobs_idx + + len(next_prompt_tokens)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_prompt_tokens)].tolist() for idx, token_id in enumerate(next_prompt_tokens): # Calculate the prompt logprob of the real prompt tokens. @@ -1167,23 +1126,22 @@ def _get_prompt_logprob_if_needed( # Add top K prompt logprobs along with its rank. if num_logprobs > 0: - top_ids = top_token_ids[top_logprob_idx, :num_logprobs].tolist() - top_probs = top_logprobs[top_logprob_idx, :num_logprobs].tolist() + top_ids = top_token_ids[ + top_logprob_idx, :num_logprobs].tolist() + top_probs = top_logprobs[ + top_logprob_idx, :num_logprobs].tolist() # Top K is already sorted by rank, so we can use 1 ~ # num_logprobs + 1 for rank. 
top_ranks = range(1, num_logprobs + 1) - prompt_logprobs_dict.update( - { - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, top_ranks) - } - ) - prompt_logprobs.append( - { - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in prompt_logprobs_dict.items() - } - ) + prompt_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip(top_ids, top_probs, + top_ranks) + }) + prompt_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in prompt_logprobs_dict.items() + }) # + 1 to go to the next prompt token. top_logprob_idx += 1 @@ -1218,44 +1176,37 @@ def _get_sampled_logprob_if_needed( # Pre-select items from tensor. tolist() is faster than repetitive # `.item()` calls. selected_logprob_items = selected_logprobs[ - selected_logprobs_idx : selected_logprobs_idx + len(next_token_ids) - ].tolist() - rank_items = ranks[ - selected_logprobs_idx : selected_logprobs_idx + len(next_token_ids) - ].tolist() + selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() for idx, (next_token_id, parent_id) in enumerate( - zip(next_token_ids, parent_seq_ids) - ): + zip(next_token_ids, parent_seq_ids)): # Get the logprob of a sampled token. sampled_logprobs_dict = { - next_token_id: (selected_logprob_items[idx], rank_items[idx]) + next_token_id: + (selected_logprob_items[idx], rank_items[idx]) } if num_logprobs is not None and num_logprobs > 0: # Get top K logprobs. - top_ids = top_token_ids[ - top_logprob_idx + parent_id, :num_logprobs - ].tolist() + top_ids = top_token_ids[top_logprob_idx + + parent_id, :num_logprobs].tolist() top_probs = top_logprobs[ - top_logprob_idx + parent_id, :num_logprobs - ].tolist() + top_logprob_idx + parent_id, :num_logprobs].tolist() # Top K is already sorted by rank, so we can use 1 ~ # num_logprobs + 1 for rank. 
top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update( - { - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip( - top_ids, top_probs, top_ranks - ) - } - ) + sampled_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip( + top_ids, top_probs, top_ranks) + }) - sampled_logprobs.append( - { - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in sampled_logprobs_dict.items() - } - ) + sampled_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in + sampled_logprobs_dict.items() + }) # NOTE: This part of code is not intuitive. `selected_logprobs` include # logprobs for the current step, which has len(next_token_ids) tokens @@ -1269,12 +1220,9 @@ def _get_sampled_logprob_if_needed( return sampled_logprobs, top_logprob_idx, selected_logprobs_idx -def _modify_greedy_probs_inplace( - logprobs: torch.Tensor, - probs: torch.Tensor, - sample_indices: torch.Tensor, - greedy_samples: torch.Tensor, -) -> None: +def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, + sample_indices: torch.Tensor, + greedy_samples: torch.Tensor) -> None: """Modify the probability distributions of the greedily-sampled tokens such that each sampled token has a "probability" of 1.0. This is required by speculative decoding, which depends on the sampling method being encoded @@ -1327,7 +1275,8 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], sample_logprobs: Optional[list[SampleLogprobs]], - on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor]], + on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, + torch.Tensor]], skip_sampler_cpu_output: bool = False, ) -> SamplerOutput: """Construct Python objects with the output of sampling. 
@@ -1346,55 +1295,48 @@ def _build_sampler_output( else: assert prompt_logprobs is not None assert sample_logprobs is not None - assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) - assert ( - len(sampling_metadata.seq_groups) - == len(maybe_deferred_sample_results) - == len(prompt_logprobs) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) + assert len(sampling_metadata.seq_groups) \ + == len(maybe_deferred_sample_results) \ + == len(prompt_logprobs) \ == len(sample_logprobs) - ) deferred_sample_results_args = None - for ( - seq_group, - sample_result, - group_prompt_logprobs, - group_sample_logprobs, - ) in zip( - sampling_metadata.seq_groups, - maybe_deferred_sample_results, - prompt_logprobs, - sample_logprobs, - ): + for (seq_group, sample_result, group_prompt_logprobs, + group_sample_logprobs) in zip(sampling_metadata.seq_groups, + maybe_deferred_sample_results, + prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result seq_outputs: list[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip( - parent_ids, next_token_ids, group_sample_logprobs - ): + parent_ids, next_token_ids, group_sample_logprobs): seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, logprobs) - ) + SequenceOutput(seq_ids[parent_id], next_token_id, + logprobs)) sampler_output.append( - CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs) - ) + CompletionSequenceGroupOutput(seq_outputs, + group_prompt_logprobs)) # If not specified, store None values in SamplerOutput. 
if on_device_tensors is not None: - (sampled_token_probs, logprobs_tensor, sampled_token_ids) = on_device_tensors + (sampled_token_probs, logprobs_tensor, + sampled_token_ids) = on_device_tensors else: - sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, None) + sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, + None) return SamplerOutput( outputs=sampler_output, sampled_token_probs=sampled_token_probs, sampled_token_ids=sampled_token_ids, logprobs=logprobs_tensor, - deferred_sample_results_args=deferred_sample_results_args, - ) + deferred_sample_results_args=deferred_sample_results_args) -def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> tuple[int, ...]: +def _get_next_prompt_tokens( + seq_group: SequenceGroupToSample) -> tuple[int, ...]: """Get a list of next prompt tokens to compute logprob from a given sequence group. @@ -1408,9 +1350,8 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> tuple[int, ...] Returns: A list of next prompt tokens to compute logprob. """ - assert ( - seq_group.is_prompt - ), "Caller should ensure the sequence group is in a prefill stage." + assert seq_group.is_prompt, ( + "Caller should ensure the sequence group is in a prefill stage.") seq_ids = seq_group.seq_ids query_len = seq_group.query_len assert query_len is not None @@ -1421,6 +1362,8 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> tuple[int, ...] prompt_tokens = seq_data.prompt_token_ids # +1 because we are looking for a next prompt token. 
next_token_index_start = computed_len + 1 - next_token_index_end = min(computed_len + query_len + 1, len(prompt_tokens)) - next_prompt_tokens = prompt_tokens[next_token_index_start:next_token_index_end] + next_token_index_end = min(computed_len + query_len + 1, + len(prompt_tokens)) + next_prompt_tokens = prompt_tokens[ + next_token_index_start:next_token_index_end] return next_prompt_tokens diff --git a/vllm_kunlun/patches/eval_frame.py b/vllm_kunlun/patches/eval_frame.py index 91eac70..9f22c8e 100644 --- a/vllm_kunlun/patches/eval_frame.py +++ b/vllm_kunlun/patches/eval_frame.py @@ -98,12 +98,10 @@ cached_backends: Dict[int, CompilerFn] = {} unset = Unset.token from torch._C._dynamo.eval_frame import set_eval_frame - - def _maybe_set_eval_frame(callback: DynamoCallback): # A wrapper on set_eval_frame that is guarded by a Justknob. # Users can disable torchDynamo by setting the JK to False. - # from torch._C._dynamo.eval_frame import set_eval_frame + #from torch._C._dynamo.eval_frame import set_eval_frame if not justknobs_check("pytorch/compiler:enable_compiler_set_eval_frame"): torch._dynamo.utils.warn_once( @@ -130,7 +128,7 @@ DONT_WRAP_FILES = { def _debug_get_cache_entry_list( - code: Union[types.CodeType, Callable[..., Any]], + code: Union[types.CodeType, Callable[..., Any]] ) -> List[CacheEntry]: """ Given a code object or a callable object, retrieve the cache entries @@ -373,9 +371,9 @@ class _TorchDynamoContext: # add context containing GraphModule to any GraphModule forward functions if isinstance(fn, GraphModule): # add context containing GraphModule to any GraphModule forward functions - code_context.get_context(fn.forward.__code__)["orig_graphmodule"] = ( - weakref.ref(fn) - ) + code_context.get_context(fn.forward.__code__)[ + "orig_graphmodule" + ] = weakref.ref(fn) # Optimize the forward method of torch.nn.Module object if isinstance(fn, torch.nn.Module): @@ -789,11 +787,9 @@ def _optimize( hooks, backend_ctx_ctor, dynamic=dynamic, - 
compiler_config=( - backend.get_compiler_config() - if hasattr(backend, "get_compiler_config") - else None - ), + compiler_config=backend.get_compiler_config() + if hasattr(backend, "get_compiler_config") + else None, rebuild_ctx=rebuild_ctx, ) @@ -907,11 +903,9 @@ class FlattenInputOutputSignature(torch.fx.interpreter.Transformer): flat_args[i], symbolic_context=StatelessSymbolicContext( dynamic_sizes=[ - ( - DimDynamic.DYNAMIC - if d in flat_args_dynamic_dims[i] - else DimDynamic.STATIC - ) + DimDynamic.DYNAMIC + if d in flat_args_dynamic_dims[i] + else DimDynamic.STATIC for d in range(len(flat_args[i].shape)) ], constraint_sizes=[None] * len(flat_args[i].shape), diff --git a/vllm_kunlun/platforms/envs.py b/vllm_kunlun/platforms/envs.py index 2fde137..8993704 100644 --- a/vllm_kunlun/platforms/envs.py +++ b/vllm_kunlun/platforms/envs.py @@ -4,28 +4,26 @@ import os from typing import TYPE_CHECKING, Any, Callable, Optional if TYPE_CHECKING: - VLLM_MULTI_LOGPATH: str = ("./log",) - ENABLE_VLLM_MULTI_LOG: bool = (False,) - ENABLE_VLLM_INFER_HOOK: bool = (False,) - ENABLE_VLLM_OPS_HOOK: bool = (False,) - ENABLE_VLLM_MODULE_HOOK: bool = False - + VLLM_MULTI_LOGPATH : str = "./log", + ENABLE_VLLM_MULTI_LOG : bool = False, + ENABLE_VLLM_INFER_HOOK : bool = False, + ENABLE_VLLM_OPS_HOOK : bool = False, + ENABLE_VLLM_MODULE_HOOK : bool = False def maybe_convert_int(value: Optional[str]) -> Optional[int]: """ - If the value is None, return None; otherwise, convert the string to an integer and return it. - + 如果值是None,则返回None;否则将字符串转换为整数并返回。 + Args: - value (Optional[str], optional): The optional string to convert. Defaults to None. - + value (Optional[str], optional): 要转换的可选字符串. Defaults to None. + Returns: - Optional[int]: If the value is None, return None; otherwise, convert the string to an integer and return it. + Optional[int]: 如果值是None,则返回None;否则将字符串转换为整数并返回. 
""" if value is None: return None return int(value) - # The begin-* and end* here are used by the documentation generator # to extract the used env vars. @@ -33,56 +31,59 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: xvllm_environment_variables: dict[str, Callable[[], Any]] = { # path to the logs of redirect-output, abstrac of related are ok - "VLLM_MULTI_LOGPATH": lambda: os.environ.get("VLLM_MULTI_LOGPATH", "./logs"), - # turn on / off multi-log of multi nodes & multi cards - "ENABLE_VLLM_MULTI_LOG": lambda: ( - os.environ.get("ENABLE_VLLM_MULTI_LOG", "False").lower() in ("true", "1") - ), - # turn on / off XVLLM infer stage log ability - "ENABLE_VLLM_INFER_HOOK": lambda: ( - os.environ.get("ENABLE_VLLM_INFER_HOOK", "False").lower() in ("true", "1") - ), - # turn on / off XVLLM infer_ops log ability - "ENABLE_VLLM_OPS_HOOK": lambda: ( - os.environ.get("ENABLE_VLLM_OPS_HOOK", "False").lower() in ("true", "1") - ), - "ENABLE_VLLM_MODULE_HOOK": lambda: ( - os.environ.get("ENABLE_VLLM_MODULE_HOOK", "False").lower() in ("true", "1") - ), + "VLLM_MULTI_LOGPATH": + lambda: os.environ.get("VLLM_MULTI_LOGPATH", "./logs"), + + # turn on / off multi-log of multi nodes & multi cards + "ENABLE_VLLM_MULTI_LOG": + lambda: (os.environ.get("ENABLE_VLLM_MULTI_LOG", "False").lower() in + ("true", "1")), + + # turn on / off XVLLM infer stage log ability + "ENABLE_VLLM_INFER_HOOK": + lambda: (os.environ.get("ENABLE_VLLM_INFER_HOOK", "False").lower() in + ("true", "1")), + + # turn on / off XVLLM infer_ops log ability + "ENABLE_VLLM_OPS_HOOK": + lambda: (os.environ.get("ENABLE_VLLM_OPS_HOOK", "False").lower() in + ("true", "1")), + + "ENABLE_VLLM_MODULE_HOOK": + lambda: (os.environ.get("ENABLE_VLLM_MODULE_HOOK", "False").lower() in + ("true", "1")), + # fuse sorted op with fused_moe kernel - "ENABLE_VLLM_MOE_FC_SORTED": lambda: ( - os.environ.get("ENABLE_VLLM_MOE_FC_SORTED", "False").lower() in ("true", "1") - ), + "ENABLE_VLLM_MOE_FC_SORTED": + lambda: 
(os.environ.get("ENABLE_VLLM_MOE_FC_SORTED", "False").lower() in + ("true", "1")), + # enable custom dpsk scaling rope - "ENABLE_CUSTOM_DPSK_SCALING_ROPE": lambda: ( - os.environ.get("ENABLE_CUSTOM_DPSK_SCALING_ROPE", "False").lower() - in ("true", "1") - ), + "ENABLE_CUSTOM_DPSK_SCALING_ROPE": + lambda: (os.environ.get("ENABLE_CUSTOM_DPSK_SCALING_ROPE", "False").lower() in + ("true", "1")), + # fuse qkv split & qk norm & qk rope # only works for qwen3 dense and qwen3 moe models - "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE": lambda: ( - os.environ.get("ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE", "False").lower() - in ("true", "1") - ), + "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE": + lambda: (os.environ.get("ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE", "False").lower() in + ("true", "1")), } # end-env-vars-definition - def __getattr__(name: str): """ - This function is called when an attribute that doesn't exist is accessed. - If the attribute is one of the xvllm_environment_variables, return the corresponding value. - Otherwise, raise an AttributeError. - + 当调用不存在的属性时,该函数被调用。如果属性是xvllm_environment_variables中的一个,则返回相应的值。否则引发AttributeError异常。 + Args: - name (str): The name of the attribute to retrieve. - + name (str): 要获取的属性名称。 + Raises: - AttributeError (Exception): If the attribute is not one of xvllm_environment_variables, this exception is raised. - + AttributeError (Exception): 如果属性不是xvllm_environment_variables中的一个,则会引发此异常。 + Returns: - Any, optional: If the attribute is one of xvllm_environment_variables, the corresponding value is returned; otherwise, None is returned. + Any, optional: 如果属性是xvllm_environment_variables中的一个,则返回相应的值;否则返回None。 """ # lazy evaluation of environment variables if name in xvllm_environment_variables: @@ -92,14 +93,13 @@ def __getattr__(name: str): def __dir__(): """ - Returns a list of all visible variable names. 
- + 返回一个包含所有可见的变量名称的列表。 + + 返回值(list):一个包含所有可见的变量名称的列表,这些变量是通过`xvllm_environment_variables`字典定义的。 + Returns: - list: A list of all visible variable names, which are defined through the `xvllm_environment_variables` dictionary. - - Returns: - List[str]: A list of all visible variable names. - These variables are defined through the `xvllm_environment_variables` dictionary. + List[str]: 一个包含所有可见的变量名称的列表。 + 这些变量是通过`xvllm_environment_variables`字典定义的。 """ return list(xvllm_environment_variables.keys()) diff --git a/vllm_kunlun/platforms/kunlun.py b/vllm_kunlun/platforms/kunlun.py index 0332c32..d52f294 100644 --- a/vllm_kunlun/platforms/kunlun.py +++ b/vllm_kunlun/platforms/kunlun.py @@ -19,10 +19,11 @@ class KunlunPlatform(Platform): @property def device_type(self): - """Returns the device type, which is fixed as 'cuda'. + """ + 返回设备类型,固定为'cuda'。 """ return "cuda" - + def is_kunlun(self) -> bool: """is_kunlun""" return self._enum == PlatformEnum.CUDA @@ -69,13 +70,14 @@ class KunlunPlatform(Platform): @classmethod def get_device_name(cls, device_id: int = 0) -> str: - """Returns the device name, which defaults to "kunlun". - + """ + 获取设备名称,默认返回 "kunlun"。 + Args: - device_id (int, optional): The device ID, default is 0. Ignored in this method. Defaults to 0. - + device_id (int, optional): 设备ID,默认为0. Ignored in this method. Defaults to 0. + Returns: - str: The device name, which is fixed as "kunlun". + str: 设备名称,固定返回 "kunlun". """ return "kunlun" @@ -89,23 +91,26 @@ class KunlunPlatform(Platform): @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: - """Returns the total memory size of the device in bytes (B). Defaults to the total memory size of the first device. - If the `device_id` parameter is not an integer or exceeds the available device range, a ValueError will be raised. - + """ + 获取设备总内存大小,单位为字节(B)。默认返回第一个设备的总内存大小。 + 如果传入参数`device_id`不是整数或者超出了可用设备范围,将会引发ValueError异常。 + Args: - device_id (int, optional): The device ID, default is 0. 
Defaults to 0. - + device_id (int, optional): 设备ID,默认为0. Defaults to 0. + Raises: - ValueError: If the `device_id` parameter is not an integer or exceeds the available device range, this exception is raised. - + ValueError: 当传入的`device_id`不是整数或者超出了可用设备范围时引发此异常。 + Returns: - int: The total memory size of the device in bytes (B). + int: 设备总内存大小,单位为字节(B)。 """ return psutil.virtual_memory().total @classmethod def inference_mode(cls): - """Returns a context manager that disables gradient computation. + """ + 进入推理模式,禁止计算梯度。 + 返回:torch.no_grad(),一个上下文管理器,用于禁止计算梯度。 """ return torch.no_grad() @@ -114,29 +119,31 @@ class KunlunPlatform(Platform): """get_device_capability""" major, minor = torch.cuda.get_device_capability() return DeviceCapability(major=major, minor=minor) + @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: - """Updates the default values of various components based on the configuration. - If not specified, automatically selects the worker class based on certain conditions. - If the block size is not set in the cache configuration, it is set to 16. - If using MLA and `VLLM_ATTENTION_BACKEND` is not set or is set to "FLASHMLA", - the cache block size is set to 64. - If running in DeepEP high throughput backend, data parallelism greater than 1, and CUDA graph mode, - it forces the use of eager mode, as DP + DeepEP high throughput kernels are not CUDA graph compatible, - and using DeepEP low latency kernels can resolve this issue. - + """ + 根据配置更新各个部分的默认值。 + 如果未指定,则根据某些条件自动选择worker类。 + 如果缓存配置中没有设置块大小,则将其设置为16。 + 如果使用MLA,并且`VLLM_ATTENTION_BACKEND`未设置或设置为"FLASHMLA", + 则将缓存块大小设置为64。 + 如果在DeepEP高吞吐量后端、数据并行大于1和CUDA图形模式下运行,则强制 + 强制执行即时模式,因为DP + DeepEP高吞吐量内核不是CUDA图形兼容的,而且 + 使用DeepEP低延迟内核可以解决这个问题。 + Args: - vllm_config (VllmConfig): VLLM configuration object. - + vllm_config (VllmConfig): VLLM配置对象。 + Raises: - NotImplementedError: If multi-step scheduling is used on vLLM V1, this exception is raised. 
- Please remove the --num-scheduler-steps argument from the command line. - NotImplementedError: If MLA is used on vLLM V1, this exception is raised. - Please ensure that the `VLLM_ATTENTION_BACKEND` environment variable is set before using MLA. - + NotImplementedError: 如果在vLLM V1上使用多步调度,则会引发NotImplementedError。 + 请从命令行中删除--num-scheduler-steps参数。 + NotImplementedError: 如果在vLLM V1上使用MLA,则会引发NotImplementedError。 + 请确保在使用MLA之前设置了`VLLM_ATTENTION_BACKEND`环境变量。 + Returns: - None: No return value. + None: 无返回值。 """ parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config @@ -159,7 +166,7 @@ class KunlunPlatform(Platform): "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" - + cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 @@ -198,9 +205,10 @@ class KunlunPlatform(Platform): vllm_config.compilation_config.pass_config.enable_fusion = False vllm_config.compilation_config.use_inductor = False + @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, - kv_cache_dtype, block_size, use_v1, use_mla,use_sink): + kv_cache_dtype, block_size, use_v1, use_mla,use_sink, use_sparse=False): """ Returns the class of attention backend based on the selected backend and other parameters. @@ -227,15 +235,16 @@ class KunlunPlatform(Platform): def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None ) -> float: - """Gets the current memory usage of the device, including allocated and max allocated. - If no device is specified, defaults to the current context's device. - - Args: - device (Optional[torch.types.Device], optional): Optional device object, defaults to None. Defaults to the current context's device. - - Returns: - float: Returns a float representing the current memory usage of the device, in bytes. 
- + """ + 获取当前设备的内存使用情况,包括已分配和最大分配。 + 如果未指定设备,则默认为当前上下文中的设备。 + + Args: + device (Optional[torch.types.Device], optional): 可选的设备对象,默认为None。默认为当前上下文中的设备。 + + Returns: + float: 返回一个浮点数,表示当前设备的内存使用情况,单位是字节(bytes)。 + Raises: None. """ @@ -244,17 +253,18 @@ class KunlunPlatform(Platform): @classmethod def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - """Checks if asynchronous output is supported. - By default, Kunlun does not support asynchronous output. - - Args: - enforce_eager (Optional[bool], optional): Whether to enforce eager execution. Defaults to None. - None means not to force eager execution, but to automatically select based on the current environment. - - Returns: - bool: True means asynchronous output is supported, False means asynchronous output is not supported. """ - # Assume Kunlun does not support asynchronous output + 判断是否支持异步输出。 + 默认情况下,Kunlun 不支持异步输出。 + + Args: + enforce_eager (Optional[bool], optional): 是否强制使用 eager execution. Defaults to None. + None 表示不强制使用 eager execution,而是根据当前环境自动选择。 + + Returns: + bool: True 表示支持异步输出,False 表示不支持异步输出。 + """ + # 假设 Kunlun 不支持异步输出 return False @classmethod @@ -279,11 +289,42 @@ class KunlunPlatform(Platform): @classmethod def get_device_communicator_cls(cls) -> str: - ''' + ''' communicator ''' - return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator" + return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator" @classmethod def get_punica_wrapper(cls): - return "vllm_kunlun.lora.punica_wrapper.punica_kunlun.PunicaWrapperKunlun" + return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU" + + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + ''' + Kunlun3平台支持的数据类型 + ''' + supported_dtypes = { + torch.float32, + torch.float16, + torch.bfloat16, + torch.int8, + } + if torch_dtype not in supported_dtypes: + raise ValueError( + f"Kunlun platform does not support dtype {torch_dtype}. " + "Supported dtypes are: fp32, fp16, bf16, int8." 
+ ) + + def opaque_attention_op(cls) -> bool: + ''' + 确保V1 Graph在Kunlun3平台使用vllm.unified_attention_with_output_kunlun作为split ops + ''' + return True + + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + return True + + @classmethod + def support_static_graph_mode(cls) -> bool: + return True \ No newline at end of file diff --git a/vllm_kunlun/utils.py b/vllm_kunlun/utils.py index 6bc6778..73894c2 100644 --- a/vllm_kunlun/utils.py +++ b/vllm_kunlun/utils.py @@ -1,61 +1,28 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import os, sys import vllm from torch.utils._python_dispatch import TorchDispatchMode -import vllm_kunlun.platforms.envs as xenvs +import vllm_kunlun.platforms.envs as xenvs from vllm.utils import weak_ref_tensor -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Generic, - Literal, - NamedTuple, - Optional, - Tuple, - TypeVar, - Union, - cast, - overload, - get_origin, - get_args, - List, -) +from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, + Optional, Tuple, TypeVar, Union, cast, overload, + get_origin, get_args, List) import torch from torch.library import Library import inspect import typing - - def redirect_output(): """ - Redirect output to a specified directory and name the log files as pp=0_rank=X or pp=1_rank=X. 
- If it is the first process of the first process group, use pp=0; otherwise, use pp=1. - + 重定向输出到指定目录,并将日志文件命名为pp=0_rank=X或pp=1_rank=X。 + 如果是第一个进程组的第一个进程,则使用pp=0;否则使用pp=1。 + Args: - No parameters. - + 无参数。 + Returns: - No return value, directly modify the file descriptors of sys.stdout and sys.stderr. + 无返回值,直接修改sys.stdout和sys.stderr的文件描述符。 """ from vllm.distributed import get_tensor_model_parallel_rank, get_pp_group - rank = get_tensor_model_parallel_rank() dir_path = xenvs.VLLM_MULTI_LOGPATH os.makedirs(dir_path, exist_ok=True) @@ -63,54 +30,48 @@ def redirect_output(): log_file = os.path.join(dir_path, f"pp=0_rank={rank}.log") else: log_file = os.path.join(dir_path, f"pp=1_rank={rank}.log") - fd = os.open(log_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644) + fd = os.open(log_file, os.O_WRONLY | os.O_CREAT| os.O_TRUNC, 0o644) os.dup2(fd, sys.stdout.fileno()) os.dup2(fd, sys.stderr.fileno()) os.close(fd) - def multi_log_monkey_patch(func): """ - Monkey patch function for logging multiple times, used to test log redirection functionality. - This function will print a log message each time the patched function is called. - + 多次打印日志的猴子补丁函数,用于测试日志重定向功能。 + 该函数会在每次调用被补丁的函数时打印一条日志信息。 + Args: - func (function): The original function to be patched. - + func (function): 需要被补丁的原始函数。 + Returns: - function: A wrapped new function that prints a log message each time it is called. 
+ function: 返回一个包装后的新函数,每次调用都会打印一条日志信息。 """ - def wrapper(*args, **kwargs): print("[monkey patch] ensure_model_parallel_initialized") func(*args, **kwargs) redirect_output() - return wrapper - +# if os.environ.get("VLLM_MULTI_LOG", "0") == "1": if xenvs.ENABLE_VLLM_MULTI_LOG: print("ENABLE_VLLM_MULTI_LOG monkey--------") vllm.distributed.ensure_model_parallel_initialized = multi_log_monkey_patch( - vllm.distributed.ensure_model_parallel_initialized - ) - + vllm.distributed.ensure_model_parallel_initialized) class StageHookPre(object): def __call__(self, *args, **kwargs): """ - This method will be automatically executed when the object is called. - If the current attention metadata is not None and a token has been processed, print "Per Token Start"; otherwise, print "First Token Start". - + 在调用对象时,会自动执行此方法。 + 如果当前的attention metadata不为None,并且已经处理了一个token,则打印"Per Token Start";否则打印"First Token Start"。 + Args: - args (tuple, optional): Variable length argument list, default is an empty tuple. - kwargs (dict, optional): Keyword arguments, default is an empty dictionary. - + args (tuple, optional): 可变参数,默认为空元组。 + kwargs (dict, optional): 关键字参数,默认为空字典。 + Returns: - None: No return value. + None: 无返回值。 """ from vllm.forward_context import get_forward_context - attn_metadata = get_forward_context().attn_metadata if attn_metadata is not None: if attn_metadata.num_decode_tokens == 0: @@ -118,22 +79,20 @@ class StageHookPre(object): else: print("Per Token Start", flush=True) - class StageHookPost(object): def __call__(self, *args, **kwargs): """ - If the current context's attention metadata is not None and num_decode_tokens equals 0, print "First Token End". - Otherwise, print "Per Token End". - + 如果当前上下文中的attention metadata不为None,并且num_decode_tokens等于0,则打印"First Token End"。 + 否则,打印"Per Token End"。 + Args: - args (Tuple[Any]): Variable length argument list, unused parameters are passed in. - kwargs (Dict[str, Any]): Keyword arguments, unused parameters are passed in. 
- + args (Tuple[Any]): 可变长度参数列表,无用参数传入。 + kwargs (Dict[str, Any]): 字典类型的关键字参数,无用参数传入。 + Returns: - None: No return value. + None: 该函数没有返回值。 """ from vllm.forward_context import get_forward_context - attn_metadata = get_forward_context().attn_metadata if attn_metadata is not None: if attn_metadata.num_decode_tokens == 0: @@ -145,24 +104,22 @@ class StageHookPost(object): class ModuleLoggingHookPre(object): def __init__(self): """ - Initialization function to initialize the indentation list and name list. - The indentation list is used to store the indentation information of each line, - and the name list is used to store the name of each variable or function. + 初始化函数,用于初始化缩进列表和名称列表。 + 缩进列表用于存储每一行的缩进信息,名称列表用于存储每一个变量或函数的名称。 """ self.indent_list = list() self.indent_list.append("") self.name_list = list() - def __call__(self, *args, **kwargs): """ - This method overrides the __call__ method and is used when the class is instantiated. - It increases the current indentation by one Tab and records the current class name. - It prints the start information, flush=True means it will be output to the console immediately. - + 重写了 __call__ 方法,用于在类实例化时调用。 + 将当前缩进增加一个 Tab,并记录当前类名称。 + 打印开始信息,flush=True 表示立即输出到控制台。 + Args: - args (tuple): Variable length argument list, default is an empty tuple. - kwargs (dict): Keyword arguments, default is an empty dictionary. - + args (tuple): 传入的参数列表,第一个元素是类实例。 + kwargs (dict): 传入的关键字参数列表,不使用。 + Returns: None. """ @@ -174,70 +131,62 @@ class ModuleLoggingHookPre(object): class ModuleLoggingHookPost(object): def __init__(self, indent_list, name_list): """ - Initialization function to set the indentation list and name list. - + 初始化函数,设置缩进列表和名称列表。 + Args: - indent_list (List[str]): A list of indentation strings for each node, indexed from 0. - name_list (List[str]): A list of name strings for each node, indexed from 0. - Note: The indentation list and name list should have the same length, otherwise it will cause an error. 
- + indent_list (List[str]): 包含每个节点的缩进字符串的列表,索引从0开始。 + name_list (List[str]): 包含每个节点的名称字符串的列表,索引从0开始。 + 注意:缩进列表和名称列表应该有相同长度,否则会导致错误。 + Returns: - None: No return value, directly modifies the instance's attributes. + None. 无返回值,直接修改了类实例的属性。 """ self.indent_list = indent_list self.name_list = name_list def __call__(self, *args, **kwargs): """ - This method is called when the object is invoked. - Args: - *args, **kwargs: Variable length argument list and keyword argument dictionary, unused. - Returns: - None: No return value. + 当调用对象时,输出模块结束信息。 + 参数:*args、**kwargs - 可变长度的位置参数列表和关键字参数字典,未使用。 + 返回值:None,无返回值。 """ print(self.indent_list[-1] + self.name_list[-1] + " Module End", flush=True) self.indent_list.pop() self.name_list.pop() - +# if os.environ.get("ENABLE_VLLM_MODULE_HOOK", "0") == "1": if xenvs.ENABLE_VLLM_MODULE_HOOK: - from torch.nn.modules.module import ( - register_module_forward_pre_hook, - register_module_forward_hook, - ) - + from torch.nn.modules.module import register_module_forward_pre_hook, register_module_forward_hook module_logging_hook_pre = ModuleLoggingHookPre() module_logging_hook_post = ModuleLoggingHookPost( - module_logging_hook_pre.indent_list, module_logging_hook_pre.name_list - ) + module_logging_hook_pre.indent_list, module_logging_hook_pre.name_list) register_module_forward_pre_hook(module_logging_hook_pre) register_module_forward_hook(module_logging_hook_post) else: module_logging_hook_pre = None module_logging_hook_post = None - class LoggingDispatchMode(TorchDispatchMode): def __init__(self): """ - Initialization function to initialize the attributes and methods of the class. - Some initialization operations can be performed here, such as setting default values. + 初始化函数,用于初始化类的属性和方法。 + 在此处可以进行一些初始化操作,例如设置默认值等。 """ super().__init__() - + def __torch_dispatch__(self, func, types, args=(), kwargs=None): """ Override the default dispatch behavior of torch.nn.Module. 
This function will be called before and after each method call on this module. It can be used to log information about the method calls. - + Args: func (function): The function that is being called on this module. types (Tuple[str]): A tuple of strings representing the type signatures of the arguments. See torch.types for more details. args (Tuple[Any], optional): The positional arguments passed to the function. Defaults to (). kwargs (Dict[str, Any], optional): The keyword arguments passed to the function. Defaults to {}. - + Returns: Any: The result returned by the function. """ @@ -249,20 +198,19 @@ class LoggingDispatchMode(TorchDispatchMode): print(indent + "{} calling".format(func), flush=True) result = func(*args, **(kwargs or {})) print(indent + "{} called".format(func), flush=True) - - return result - + + return result class CUDAGraphInnerWatcher(TorchDispatchMode): - + def __init__(self, name_list): """ - Initialization function to save the name list to the class attribute. - It also creates a dictionary to keep track of the tensors that have been traced. - + 初始化函数,将传入的名称列表保存到类属性中。 + 同时创建一个字典来记录已经追踪过的张量。 + Args: - name_list (List[str]): A list of names of tensors to be tracked. - + name_list (List[str]): 包含需要追踪的张量名称的列表。 + Returns: None. """ @@ -274,13 +222,13 @@ class CUDAGraphInnerWatcher(TorchDispatchMode): Override the default dispatch behavior of PyTorch tensors to track the tracing process. If the result of a function call is a tensor on CUDA, it will be added to the traced_tensor dictionary with the name of the function. - + Args: func (Callable): The function to be called. types (Tuple[Type]): The type hints of the function. args (Tuple[Any], optional): Positional arguments for the function. Defaults to (). kwargs (Optional[Dict[str, Any]], optional): Keyword arguments for the function. Defaults to None. - + Returns: Any: The result of the function call. 
""" @@ -292,13 +240,13 @@ class CUDAGraphInnerWatcher(TorchDispatchMode): def __exit__(self, exc_type, exc_val, exc_tb): """ - Clear the traced_tensor and name_list, and call the parent class's __exit__ method. - + 清空 traced_tensor 和 name_list,并调用父类的 __exit__ 方法。 + Args: - exc_type (Optional[Type[BaseException]]): The type of the exception, default is None. - exc_val (Optional[BaseException]): The value of the exception, default is None. - exc_tb (Optional[TracebackType]): he traceback object, default is None. - + exc_type (Optional[Type[BaseException]]): 异常类型,默认为 None。 + exc_val (Optional[BaseException]): 异常值,默认为 None。 + exc_tb (Optional[TracebackType]): Traceback 对象,默认为 None。 + Returns: None. """ @@ -308,11 +256,22 @@ class CUDAGraphInnerWatcher(TorchDispatchMode): self.name_list.clear() super(CUDAGraphInnerWatcher, self).__exit__(exc_type, exc_val, exc_tb) +# def patch_annotations_for_schema(func): +# sig = inspect.signature(func) +# new_params = [] +# for name, param in sig.parameters.items(): +# anno = param.annotation +# if anno == list[int]: +# anno = typing.List[int] +# new_params.append(param.replace(annotation=anno)) +# new_sig = sig.replace(parameters=new_params) +# func.__signature__ = new_sig +# return func def patch_annotations_for_schema(func): """ - At runtime, replace list[int] and Optional[list[int]] in the function signature with typing.List[int] and Optional[typing.List[int]] - so that torch.library.infer_schema can recognize it. 
+ 运行时替换函数签名里的 list[int]、Optional[list[int]] 为 typing.List[int] / Optional[typing.List[int]] + 让 torch.library.infer_schema 能识别 """ sig = inspect.signature(func) new_params = [] @@ -320,7 +279,7 @@ def patch_annotations_for_schema(func): for name, param in sig.parameters.items(): ann = param.annotation - # If it is Optional[T] + # 如果是 Optional[T] if get_origin(ann) is typing.Union and type(None) in get_args(ann): inner_type = [a for a in get_args(ann) if a is not type(None)][0] if get_origin(inner_type) is list: # Optional[list[int]] @@ -328,7 +287,7 @@ def patch_annotations_for_schema(func): new_ann = Optional[List[inner_args[0] if inner_args else typing.Any]] param = param.replace(annotation=new_ann) - # If it is a direct list[int] + # 如果是直接 list[int] elif get_origin(ann) is list: args = get_args(ann) new_ann = List[args[0] if args else typing.Any] @@ -339,23 +298,20 @@ def patch_annotations_for_schema(func): func.__signature__ = sig.replace(parameters=new_params) return func - def supports_custom_op() -> bool: """supports_custom_op""" return hasattr(torch.library, "custom_op") - vllm_lib = Library("vllm", "FRAGMENT") # noqa - def direct_register_custom_op( - op_name: str, - op_func: Callable, - mutates_args: list[str], - fake_impl: Optional[Callable] = None, - target_lib: Optional[Library] = None, - dispatch_key: str = "CUDA", - tags: tuple[torch.Tag, ...] = (), + op_name: str, + op_func: Callable, + mutates_args: list[str], + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, + dispatch_key: str = "CUDA", + tags: tuple[torch.Tag, ...] = (), ): """ `torch.library.custom_op` can have significant overhead because it @@ -374,28 +330,25 @@ def direct_register_custom_op( """ if not supports_custom_op(): from vllm.platforms import current_platform - assert not current_platform.is_cuda_alike(), ( "cuda platform needs torch>=2.4 to support custom op, " "chances are you are using an old version of pytorch " "or a custom build of pytorch. 
It is recommended to " "use vLLM in a fresh new environment and let it install " - "the required dependencies." - ) + "the required dependencies.") return import torch.library - if hasattr(torch.library, "infer_schema"): patched_func = patch_annotations_for_schema(op_func) - schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args) + schema_str = torch.library.infer_schema(op_func, + mutates_args=mutates_args) else: # for pytorch 2.4 import torch._custom_op.impl - schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) my_lib = target_lib or vllm_lib my_lib.define(op_name + schema_str, tags=tags) my_lib.impl(op_name, op_func, dispatch_key=dispatch_key) if fake_impl is not None: - my_lib._register_fake(op_name, fake_impl) + my_lib._register_fake(op_name, fake_impl) \ No newline at end of file diff --git a/vllm_kunlun/v1/attention/backends/kunlun_attn.py b/vllm_kunlun/v1/attention/backends/kunlun_attn.py index 4371362..612ca71 100644 --- a/vllm_kunlun/v1/attention/backends/kunlun_attn.py +++ b/vllm_kunlun/v1/attention/backends/kunlun_attn.py @@ -1,8 +1,5 @@ # # Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Author: Dong Xinyu, Bao Qian, Chen Zhennan, Ma Tianyu, Wang Haowen -# Email: dongxinyu03@baidu.com -# This file is a part of the vllm-kunlun project. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# This file is a part of the vllm-kunlun project. 
+# from vllm.config import VllmConfig, get_layers_from_vllm_config import xtorch_ops from dataclasses import dataclass @@ -24,8 +23,8 @@ import torch import numpy as np from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionLayer, AttentionType) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.backends.utils import is_block_tables_empty, compute_slot_mapping_start_idx, compute_slot_mapping +# from vllm.attention.backends.utils import CommonAttentionState +# from vllm.attention.backends.utils import is_block_tables_empty, compute_slot_mapping_start_idx, compute_slot_mapping from vllm_kunlun.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) from vllm_kunlun.ops._kunlun_ops import KunlunOps @@ -45,6 +44,7 @@ from vllm.v1.worker.block_table import BlockTable from vllm.config import VllmConfig, get_layers_from_vllm_config + class KunlunAttentionBackend(AttentionBackend): """KunlunAttentionBackend""" # crucial to cuda graph @@ -70,10 +70,10 @@ class KunlunAttentionBackend(AttentionBackend): """get_builder_cls""" return KunlunAttentionMetadataBuilder - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - """get_state_cls""" - return CommonAttentionState + # @staticmethod + # def get_state_cls() -> Type["CommonAttentionState"]: + # """get_state_cls""" + # return CommonAttentionState @staticmethod def get_kv_cache_shape( @@ -81,6 +81,7 @@ class KunlunAttentionBackend(AttentionBackend): block_size: int, num_kv_heads: int, head_size: int, + cache_dtype_str: str = "auto" ) -> Tuple[int, ...]: """get_kv_cache_shape""" # return (2, num_blocks, block_size, num_kv_heads * head_size) @@ -132,7 +133,11 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): # Cuda-graph is currently enabled for decoding only. # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. 
use_cuda_graph: bool + + slot_mapping: torch.Tensor + block_tables: torch.Tensor + multi_modal_placeholder_index_maps: Optional[torch.Tensor] = None # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. seq_lens: Optional[List[int]] = None @@ -143,6 +148,11 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): # [4, 6], it is [0, 4, 10]. seq_start_loc: Optional[torch.Tensor] = None + + # Prefix cache loc + kv_lod_cpu: Optional[torch.Tensor] = None + kv_lod_xpu: Optional[torch.Tensor] = None + # (batch_size,) A tensor of context lengths (tokens that are computed # so far). context_lens_tensor: Optional[torch.Tensor] = None @@ -181,6 +191,7 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): # Number of tokens input to encoder num_encoder_tokens: Optional[int] = None + enable_kv_scales_calculation: Optional[bool] = False # Cross-attention memory-mapping data structures: slot mapping # and block tables cross_slot_mapping: Optional[torch.Tensor] = None @@ -193,6 +204,11 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): use_cascade: Optional[bool] = False seq_lens_tensor_cpu: Optional[torch.Tensor] = None + + num_prefill_tokens: int = 0 + num_decode_tokens: int = 0 + num_prefills: int = 0 + num_decodes: int = 0 def __post_init__(self): """__post_init__""" @@ -253,6 +269,19 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): input_positions = (None if self.input_positions is None else self.input_positions[-self.num_prefills:]) + + if self.kv_lod_cpu is None: + kv_lod_cpu = None + kv_lod_xpu = None + else: + start = -(self.num_prefills + 1) + base_cpu = self.kv_lod_cpu[start] + kv_lod_cpu = self.kv_lod_cpu[start:] - base_cpu + + base_xpu = self.kv_lod_xpu[start] + kv_lod_xpu = self.kv_lod_xpu[start:] - base_xpu + + # Construct & cache prefill-phase attention metadata structure self._cached_prefill_metadata = KunlunMetadata( 
num_actual_tokens=self.num_actual_tokens, @@ -264,7 +293,9 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata): slot_mapping=slot_mapping, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, - seq_start_loc=None, + seq_start_loc = None, + kv_lod_cpu=kv_lod_cpu, + kv_lod_xpu=kv_lod_xpu, max_query_len=self.max_query_len, max_kv_len=self.max_kv_len, max_prefill_seq_len=self.max_prefill_seq_len, @@ -413,18 +444,28 @@ class KunlunAttentionMetadataBuilder: self._num_decode_tokens = num_decode_tokens self._num_prefill_tokens = num_prefill_tokens return modified_batch + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata + ) -> KunlunMetadata: + attn_metadata = self.build(0, common_attn_metadata) + # When doing full graph capture, setting seq_lens to + # max_model_len will cause graph capture to be extremely + # slow, so here we set it to 1. + attn_metadata.seq_lens_tensor.fill_(1) + return attn_metadata def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata): """build""" - num_reqs=common_attn_metadata.num_reqs - num_actual_tokens=common_attn_metadata.num_actual_tokens - max_query_len=common_attn_metadata.max_query_len - common_prefix_len=common_prefix_len + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + common_prefix_len = common_prefix_len block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping - + max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) query_start_loc_host = common_attn_metadata.query_start_loc_cpu[:num_reqs + 1] query_start_loc = common_attn_metadata.query_start_loc_cpu[:num_reqs + 1].to( @@ -432,18 +473,18 @@ class KunlunAttentionMetadataBuilder: seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu - + seq_start_loc = list(accumulate(seq_lens, initial=0)) - - if 
len(seq_start_loc) != num_reqs + 1: - seq_start_loc = query_start_loc_host.tolist() - - if seq_start_loc[-1] != num_actual_tokens: - seq_start_loc = query_start_loc_host.tolist() - + + + seq_start_loc_tensor = torch.empty(len(seq_start_loc), dtype=torch.int32, device=self.device) seq_start_loc_tensor.copy_(torch.as_tensor(seq_start_loc, dtype=torch.int32)) + kv_lod_cpu = torch.zeros(num_reqs + 1, dtype=torch.int32, device="cpu") + kv_lod_cpu[1:] = seq_lens_cpu.to(torch.int32).cumsum(dim=0) + kv_lod_xpu = kv_lod_cpu.to(self.device) + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\ split_decodes_and_prefills(common_attn_metadata) @@ -456,6 +497,7 @@ class KunlunAttentionMetadataBuilder: max_decode_seq_len = np.max(tmp_decode_scheduled_tokens) tmp_prefill_scheduled_tokens = num_scheduled_tokens[num_decodes: num_reqs] + if num_prefill_tokens == 0: max_prefill_seq_len = 0 else: @@ -473,6 +515,8 @@ class KunlunAttentionMetadataBuilder: num_decode_tokens=num_decode_tokens, seq_lens_tensor=seq_lens, seq_lens_tensor_cpu=seq_lens_cpu, + kv_lod_xpu=kv_lod_xpu, + kv_lod_cpu=kv_lod_cpu, max_query_len=max_prefill_seq_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, @@ -483,7 +527,6 @@ class KunlunAttentionMetadataBuilder: use_cuda_graph=False, use_cascade=use_cascade, ) - return attn_metadata def can_run_in_cudagraph( @@ -514,11 +557,15 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): attn_type: AttentionType = AttentionType.DECODER, use_irope: bool = False, sinks:Optional[torch.Tensor]= None, + multi_modal_placeholder_index_maps:Optional[torch.Tensor]= None, ) -> None: """__init__""" if blocksparse_params is not None: raise ValueError( "kunlunAttention does not support block-sparse attention.") + # if logits_soft_cap is not None: + # raise ValueError( + # "kunlunAttention does not support attention logits soft capping.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -547,6 
+594,7 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): "Sinks must have the same number of heads as the number of " f"heads in the layer. Sinks shape: {sinks.shape}, " f"num_heads: {num_heads}.") + self.multi_modal_placeholder_index_maps = multi_modal_placeholder_index_maps def forward( self, @@ -560,7 +608,8 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None + output_scale: Optional[torch.Tensor] = None, + output_block_scale: Optional[torch.Tensor] = None ) -> torch.Tensor: """forward""" query = query.view(-1, self.num_heads, self.head_size) @@ -597,12 +646,22 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory value = value.contiguous() - xtorch_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - updated_slot_mapping) + if key_cache.is_contiguous(): + xtorch_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + updated_slot_mapping) + else: + cast_key_cache = key_cache.squeeze(1).unsqueeze(-2) + cast_value_cache = value_cache.squeeze(1).unsqueeze(-2) + xtorch_ops.reshape_and_cache_flash( + key, + value, + cast_key_cache, + cast_value_cache, + updated_slot_mapping) assert attn_type == AttentionType.DECODER # Decoder self-attention supports chunked prefill. @@ -614,22 +673,38 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. 
prefill_query = query[num_decode_tokens:attn_metadata.num_actual_tokens] - prefill_key = key[num_decode_tokens:attn_metadata.num_actual_tokens] - prefill_value = value[num_decode_tokens:attn_metadata.num_actual_tokens] - assert prefill_query.shape[0] == num_prefill_tokens - output[num_decode_tokens:attn_metadata.num_actual_tokens] = KunlunOps.multi_query_kv_attention( - prefill_meta.query_start_loc,prefill_meta.query_start_loc_host, prefill_query, prefill_key, prefill_value, - alibi_slopes=self.alibi_slopes).view_as(prefill_query) + xtorch_ops.prefill_attention( + q=prefill_query, + k=key_cache, # Key Cache (block_num, head, block_size, dim) + v=value_cache, + out=output[num_decode_tokens:attn_metadata.num_actual_tokens], + is_causal=True, + is_prefix_cache=True, + block_table=prefill_meta.block_tables, + context_qlen_lod_cpu=prefill_meta.query_start_loc_host, + context_qlen_lod_xpu=prefill_meta.query_start_loc, + context_kvlen_lod_cpu=prefill_meta.kv_lod_cpu, + context_kvlen_lod_xpu=prefill_meta.kv_lod_xpu, + alibi_slopes=self.alibi_slopes, + softmax_lse=None, + sink=self.sinks + ) + if decode_meta := attn_metadata.decode_metadata: assert attn_type != AttentionType.ENCODER_ONLY, ( "Encoder-only models should not have decode metadata.") decode_query = query[:num_decode_tokens] - + + if key_cache.is_contiguous(): + tmp_block_tables = decode_meta.block_tables + else: + tmp_block_tables = decode_meta.block_tables * 2 # only test in Qwen3-Next + xtorch_ops.paged_attention( x=decode_query, k_cache=key_cache, v_cache=value_cache, - block_tables=decode_meta.block_tables, + block_tables=tmp_block_tables, context_lens_cpu=decode_meta.seq_lens_tensor_cpu, context_lens_xpu=decode_meta.seq_lens_tensor, is_context=False, @@ -639,7 +714,6 @@ class KunlunAttentionImpl(AttentionImpl[KunlunMetadata]): ) # Reshape the output tensor. 
return output.view(-1, self.num_heads * self.head_size) - def use_cascade_attention( common_prefix_len: int, query_lens: np.ndarray, @@ -650,13 +724,18 @@ def use_cascade_attention( num_sms: int, use_local_attention: bool = False, ) -> bool: - """ - TODO: Not Yet Supported on Kunlun platform + """Decide whether to use cascade attention. + + This function 1) checks whether cascade attention is supported with the + given configuration, and 2) heuristically decides whether using cascade + attention can improve performance. """ # Too short common prefix. Probably not worth using cascade attention. # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold. # NOTE(woosuk): This is the common case. We should return False as soon as # possible to avoid any unnecessary computation. + return False + if common_prefix_len < 256: return False # Cascade attention is currently not supported with these variants. diff --git a/vllm_kunlun/v1/sample/ops/penalties.py b/vllm_kunlun/v1/sample/ops/penalties.py deleted file mode 100644 index 55f4ff7..0000000 --- a/vllm_kunlun/v1/sample/ops/penalties.py +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch -from vllm.utils import is_pin_memory_available, make_tensor_with_pad - - -def get_token_bin_counts_and_mask( - tokens: torch.Tensor, - vocab_size: int, - num_seqs: int, -) -> tuple[torch.Tensor, torch.Tensor]: - # Compute the bin counts for the tokens. - # vocab_size + 1 for padding. 
- bin_counts = torch.zeros((num_seqs, vocab_size + 1), - dtype=torch.long, - device=tokens.device) - bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) - bin_counts = bin_counts[:, :vocab_size] - mask = bin_counts > 0 - - return bin_counts, mask - -def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, - output_tokens_tensor: torch.Tensor, - presence_penalties: torch.Tensor, - frequency_penalties: torch.Tensor, - repetition_penalties: torch.Tensor) -> torch.Tensor: - """ - Applies penalties in place to the logits tensor - logits : The input logits tensor of shape [num_seqs, vocab_size] - prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts - are padded to the maximum prompt length within the batch using - `vocab_size` as the padding value. The value `vocab_size` is used - for padding because it does not correspond to any valid token ID - in the vocabulary. - output_tokens_tensor: The output tokens tensor. - presence_penalties: The presence penalties of shape (num_seqs, ) - frequency_penalties: The frequency penalties of shape (num_seqs, ) - repetition_penalties: The repetition penalties of shape (num_seqs, ) - """ - num_seqs, vocab_size = logits.shape - _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor, - vocab_size, num_seqs) - output_bin_counts, output_mask = get_token_bin_counts_and_mask( - output_tokens_tensor, vocab_size, num_seqs) - - # Apply repetition penalties as a custom op - from vllm._custom_ops import apply_repetition_penalties_torch - apply_repetition_penalties_torch(logits, prompt_mask, output_mask, - repetition_penalties) - - # We follow the definition in OpenAI API. 
- # Refer to https://platform.openai.com/docs/api-reference/parameter-details - logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts - logits -= presence_penalties.unsqueeze(dim=1) * output_mask - return logits - -def apply_all_penalties( - logits: torch.Tensor, - prompt_token_ids: torch.Tensor, - presence_penalties: torch.Tensor, - frequency_penalties: torch.Tensor, - repetition_penalties: torch.Tensor, - output_token_ids: list[list[int]], -) -> torch.Tensor: - """ - Applies presence, frequency and repetition penalties to the logits. - """ - _, vocab_size = logits.shape - output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, - logits.device) - return apply_penalties(logits, prompt_token_ids, output_tokens_t, - presence_penalties, frequency_penalties, - repetition_penalties) - -def _convert_to_tensors(output_token_ids: list[list[int]], vocab_size: int, - device: torch.device) -> torch.Tensor: - """ - Convert the different list data structures to tensors. - """ - output_tokens_tensor = make_tensor_with_pad( - output_token_ids, - # Use the value of vocab_size as a pad since we don't have a - # token_id of this value. - pad=vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=is_pin_memory_available(), - ) - return output_tokens_tensor.to(device, non_blocking=True) diff --git a/vllm_kunlun/v1/sample/ops/topk_topp_sampler.py b/vllm_kunlun/v1/sample/ops/topk_topp_sampler.py index e175040..64516ff 100644 --- a/vllm_kunlun/v1/sample/ops/topk_topp_sampler.py +++ b/vllm_kunlun/v1/sample/ops/topk_topp_sampler.py @@ -22,7 +22,7 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. """ - def __init__(self): + def __init__(self, logprobs_mode): super().__init__() logger.info_once( "Using FlashInfer for top-p & top-k sampling.") @@ -57,7 +57,7 @@ class TopKTopPSampler(nn.Module): # not needed. This is because `random_sample` does not require # CPU-GPU synchronization while `flashinfer_sample` does. 
probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return random_sample(probs, generators), None if generators: logger.warning_once("FlashInfer 0.2.3+ does not support " "per-request generators. Falling back to " @@ -66,8 +66,7 @@ class TopKTopPSampler(nn.Module): # flashinfer sampling functions expect contiguous logits. # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous # because of slicing operation in logits_processor. - return flashinfer_sample(logits.contiguous(), k, p, generators) - + return flashinfer_sample(logits.contiguous(), k, p, generators), None def apply_top_k_top_p( @@ -195,4 +194,4 @@ def flashinfer_sample( next_token_ids = xtorch_ops.top_k_top_p_sampling_from_probs( probs, top_k=k, top_p=p, deterministic=True) - return next_token_ids.view(-1) + return next_token_ids.view(-1) \ No newline at end of file diff --git a/vllm_kunlun/v1/worker/gpu_model_runner.py b/vllm_kunlun/v1/worker/gpu_model_runner.py new file mode 100644 index 0000000..637cc12 --- /dev/null +++ b/vllm_kunlun/v1/worker/gpu_model_runner.py @@ -0,0 +1,4184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import gc +import itertools +import time +from collections import defaultdict +from collections.abc import Iterator +from contextlib import contextmanager +from copy import deepcopy +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast + +import numpy as np +import torch +import torch.distributed +import torch.nn as nn +import vllm.envs as envs +from tqdm import tqdm +from typing_extensions import TypeAlias +from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention +from vllm.compilation.counter import compilation_counter +from vllm.compilation.cuda_graph import CUDAGraphWrapper +from 
vllm.compilation.monitor import set_cudagraph_capturing_enabled +from vllm.config import (CompilationLevel, CUDAGraphMode, VllmConfig, + get_layers_from_vllm_config, update_config) +from vllm.distributed.eplb.eplb_state import EplbState +from vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group) +from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks +from vllm.distributed.parallel_state import ( + get_pp_group, get_tp_group, graph_capture, is_global_first_rank, + prepare_communication_buffer_for_model) +from vllm.forward_context import (BatchDescriptor, DPMetadata, + set_forward_context) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader +from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache +# yapf conflicts with isort for this block +# yapf: disable +from vllm.model_executor.models.interfaces import (SupportsMultiModal, + is_mixture_of_experts, + supports_eagle3, + supports_mrope, + supports_multimodal_pruning, + supports_transcription) +# yapf: enable +from vllm.model_executor.models.interfaces_base import ( + VllmModelForPooling, is_pooling_model, is_text_generation_model) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, + PlaceholderRange) +from vllm.multimodal.utils import group_mm_kwargs_by_modality +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import SamplingType +from vllm.sequence import IntermediateTensors +from vllm.tasks import GenerationTask, PoolingTask, SupportedTask +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, + GiB_bytes, cdiv, check_use_alibi, get_dtype_size, + 
is_pin_memory_available, + length_from_prompt_token_ids_or_embeds, round_up, + supports_dynamo) +from vllm.utils.jsontree import json_map_leaves +from vllm.v1.attention.backends.flash_attn import AttentionMetadata +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + create_fast_prefill_custom_backend, + reorder_batch_to_split_decodes_and_prefills, split_attn_metadata) +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +# yapf conflicts with isort for this block +# yapf: disable +from vllm.v1.kv_cache_interface import (AttentionSpec, + ChunkedLocalAttentionSpec, + CrossAttentionSpec, + EncoderOnlyAttentionSpec, + FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheSpec, + MambaSpec, MLAAttentionSpec, + SlidingWindowSpec, + UniformTypeKVCacheSpecs) +# yapf: enable +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, + DraftTokenIds, LogprobsLists, LogprobsTensors, + ModelRunnerOutput, PoolerOutput, SamplerOutput) +from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.rejection_sampler import RejectionSampler +from vllm.v1.sample.sampler import Sampler +from vllm.v1.spec_decode.eagle import EagleProposer +from vllm.v1.spec_decode.medusa import MedusaProposer +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.structured_output.utils import apply_grammar_bitmask +from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper +from vllm.v1.worker.kv_connector_model_runner_mixin import \ + 
KVConnectorModelRunnerMixin +from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.ubatch_splitting import (check_ubatch_thresholds, + ubatch_split) +from vllm.v1.worker.ubatch_utils import UBatchSlice, UBatchSlices +from vllm.v1.worker.utils import (AttentionGroup, MultiModalBudget, + add_kv_sharing_layers_to_kv_cache_groups, + bind_kv_cache, gather_mm_placeholders, + is_residual_scattered_for_sp, + sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) + +if TYPE_CHECKING: + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + from vllm.v1.core.sched.output import SchedulerOutput + +logger = init_logger(__name__) + +AttnMetadataDict: TypeAlias = dict[str, AttentionMetadata] +# list when ubatching is enabled +PerLayerAttnMetadata: TypeAlias = Union[list[AttnMetadataDict], + AttnMetadataDict] + + +# Wrapper for ModelRunnerOutput to support overlapped execution. +class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): + + def __init__( + self, + model_runner_output: ModelRunnerOutput, + sampled_token_ids: torch.Tensor, + invalid_req_indices: list[int], + async_output_copy_stream: torch.cuda.Stream, + ): + self._model_runner_output = model_runner_output + self._invalid_req_indices = invalid_req_indices + + # Event on the copy stream so we can synchronize the non-blocking copy. + self._async_copy_ready_event = torch.cuda.Event() + + # Keep a reference to the device tensor to avoid it being + # deallocated until we finish copying it to the host. + self._sampled_token_ids = sampled_token_ids + + # Initiate the copy on a separate stream, but do not synchronize it. 
+ default_stream = torch.cuda.current_stream() + with torch.cuda.stream(async_output_copy_stream): + async_output_copy_stream.wait_stream(default_stream) + self._sampled_token_ids_cpu = self._sampled_token_ids.to( + 'cpu', non_blocking=True) + self._async_copy_ready_event.record() + + def get_output(self) -> ModelRunnerOutput: + """Copy the device tensors to the host and return a ModelRunnerOutput. + + This function blocks until the copy is finished. + """ + self._async_copy_ready_event.synchronize() + + # Release the device tensor once the copy has completed + del self._sampled_token_ids + + valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist() + for i in self._invalid_req_indices: + valid_sampled_token_ids[i].clear() + + output = self._model_runner_output + output.sampled_token_ids = valid_sampled_token_ids + return output + + +class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.compilation_config = vllm_config.compilation_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.speculative_config = vllm_config.speculative_config + self.observability_config = vllm_config.observability_config + + from vllm.model_executor.models.utils import set_cpu_offload_max_bytes + set_cpu_offload_max_bytes( + int(self.cache_config.cpu_offload_gb * 1024**3)) + + model_config = self.model_config + cache_config = self.cache_config + scheduler_config = self.scheduler_config + parallel_config = self.parallel_config + self.device = device + self.pin_memory = is_pin_memory_available() + self.dtype = self.model_config.dtype + if cache_config.cache_dtype == "auto": + self.kv_cache_dtype = 
self.dtype + else: + self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + cache_config.cache_dtype] + + self.is_pooling_model = (model_config.runner_type == 'pooling') + self.enable_prompt_embeds = model_config.enable_prompt_embeds + self.is_multimodal_raw_input_only_model = ( + model_config.is_multimodal_raw_input_only_model) + # This will be overridden in load_model() + self.is_multimodal_pruning_enabled = False + self.max_model_len = model_config.max_model_len + self.dcp_world_size = self.parallel_config.decode_context_parallel_size + self.max_num_tokens = scheduler_config.max_num_batched_tokens + self.max_num_reqs = scheduler_config.max_num_seqs + + # Broadcast PP output for external_launcher (torchrun) + # to make sure we are synced across pp ranks + # TODO: Support overlapping mirco-batches + # https://github.com/vllm-project/vllm/issues/18019 + self.broadcast_pp_output = ( + self.parallel_config.distributed_executor_backend + == "external_launcher" and len(get_pp_group().ranks) > 0) + + # Model-related. + self.num_query_heads = model_config.get_num_attention_heads( + parallel_config) + self.hidden_size = model_config.get_hidden_size() + self.attention_chunk_size = model_config.attention_chunk_size + # Only relevant for models using ALiBi (e.g, MPT) + self.use_alibi = check_use_alibi(model_config) + + self.cascade_attn_enabled = not self.model_config.disable_cascade_attn + + # Multi-modal data support + self.mm_registry = MULTIMODAL_REGISTRY + self.uses_mrope = model_config.uses_mrope + self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( + model_config) + + if self.model_config.is_encoder_decoder: + # Maximum length of the encoder input, only for encoder-decoder + # models. 
+ self.max_encoder_len = scheduler_config.\ + max_num_encoder_input_tokens + else: + self.max_encoder_len = 0 + + # Sampler + self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) + + self.eplb_state: Optional[EplbState] = None + """ + State of the expert parallelism load balancer. + + Will be lazily initialized when the model is loaded. + """ + + # Lazy initializations + # self.model: nn.Module # Set after load_model + # Initialize in initialize_kv_cache + self.kv_caches: list[torch.Tensor] = [] + # indexes: [kv_cache_group_id][attn_group] + self.attn_groups: list[list[AttentionGroup]] = [] + # self.kv_cache_config: KVCacheConfig + + # mm_hash -> encoder_output + self.encoder_cache: dict[str, torch.Tensor] = {} + + self.use_aux_hidden_state_outputs = False + # Set up speculative decoding. + # NOTE(Jiayi): currently we put the entire draft model on + # the last PP rank. This is not ideal if there are many + # layers in the draft model. + if self.speculative_config and get_pp_group().is_last_rank: + if self.speculative_config.method == "ngram": + self.drafter = NgramProposer(self.vllm_config) + elif self.speculative_config.use_eagle(): + self.drafter = EagleProposer(self.vllm_config, self.device, + self) # type: ignore + if self.speculative_config.method == "eagle3": + self.use_aux_hidden_state_outputs = True + elif self.speculative_config.method == "medusa": + self.drafter = MedusaProposer( + vllm_config=self.vllm_config, + device=self.device) # type: ignore + else: + raise ValueError("Unknown speculative decoding method: " + f"{self.speculative_config.method}") + self.rejection_sampler = RejectionSampler() + + # Request states. + self.requests: dict[str, CachedRequestState] = {} + self.comm_stream = torch.cuda.Stream() + + # Input Batch + # NOTE(Chen): Ideally, we should initialize the input batch inside + # `initialize_kv_cache` based on the kv cache config. 
However, as in + # https://github.com/vllm-project/vllm/pull/18298, due to some unknown + # reasons, we have to initialize the input batch before `load_model`, + # quantization + weight offloading will fail otherwise. As a temporary + # solution, we initialize the input batch here, and re-initialize it + # in `initialize_kv_cache` if the block_sizes here is different from + # the block_sizes in the kv cache config. + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + # We need to use the encoder length for encoder-decoer + # because of KV cache for cross-attention. + max_model_len=max(self.max_model_len, self.max_encoder_len), + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=[self.cache_config.block_size], + is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=build_logitsprocs( + self.vllm_config, self.device, self.pin_memory, + self.is_pooling_model, + self.vllm_config.model_config.logits_processors), + is_pooling_model=self.is_pooling_model, + ) + + self.use_async_scheduling = self.scheduler_config.async_scheduling + self.async_output_copy_stream = torch.cuda.Stream() if \ + self.use_async_scheduling else None + + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. + # The convention is different. + # self.cudagraph_batch_sizes sorts in ascending order. + # The batch sizes in the config are in descending order. + if self.compilation_config.cudagraph_capture_sizes and \ + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + self.cudagraph_batch_sizes = list( + reversed(self.compilation_config.cudagraph_capture_sizes)) + + # Cache the device properties. + self._init_device_properties() + + # Persistent buffers for CUDA graphs. 
+ self.input_ids = self._make_buffer(self.max_num_tokens, + dtype=torch.int32) + self.positions = self._make_buffer(self.max_num_tokens, + dtype=torch.int64) + self.query_start_loc = self._make_buffer(self.max_num_reqs + 1, + dtype=torch.int32) + self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32) + # Because inputs_embeds may be bfloat16 and we don't need a numpy + # version of this tensor, avoid a RuntimeError by not creating a + # numpy buffer. + self.inputs_embeds = self._make_buffer(self.max_num_tokens, + self.hidden_size, + dtype=self.dtype, + numpy=False) + self.is_token_ids = self._make_buffer(self.max_num_tokens, + dtype=torch.bool) + self.discard_request_indices = self._make_buffer(self.max_num_reqs, + dtype=torch.int64) + self.num_discarded_requests = 0 + + self.num_decode_draft_tokens = self._make_buffer(self.max_num_reqs, + dtype=torch.int32) + self.num_accepted_tokens = self._make_buffer(self.max_num_reqs, + dtype=torch.int64) + # Only relevant for multimodal models + if self.supports_mm_inputs: + self.is_mm_embed = self._make_buffer(self.max_num_tokens, + dtype=torch.bool) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = self._make_buffer( + (3, self.max_num_tokens + 1), dtype=torch.int64) + + # CUDA event to synchronize use of reused CPU tensors between steps + # when async scheduling is enabled. 
+ self.prepare_inputs_event: Optional[torch.cuda.Event] = None + if self.use_async_scheduling: + self.prepare_inputs_event = torch.cuda.Event() + # Start in a completed state. + self.prepare_inputs_event.record(torch.cuda.default_stream()) + + # None in the first PP rank. The rest are set after load_model. + self.intermediate_tensors: Optional[IntermediateTensors] = None + + # OPTIMIZATION: Cache the tensors rather than creating them every step. + # Keep in int64 to avoid overflow with long context + self.arange_np = np.arange(max(self.max_num_reqs + 1, + self.max_model_len, + self.max_num_tokens), + dtype=np.int64) + + # Layer pairings for cross-layer KV sharing. + # If an Attention layer `layer_name` is in the keys of this dict, it + # means this layer will perform attention using the keys and values + # from the KV cache of `shared_kv_cache_layers[layer_name]`. + self.shared_kv_cache_layers: dict[str, str] = {} + self.kv_sharing_fast_prefill_eligible_layers: set[str] = set() + + self.kv_sharing_fast_prefill_logits_indices = None + if self.cache_config.kv_sharing_fast_prefill: + self.kv_sharing_fast_prefill_logits_indices = torch.zeros( + self.max_num_tokens, dtype=torch.int32, device=self.device) + + self.uniform_decode_query_len = 1 if not self.speculative_config else \ + 1 + self.speculative_config.num_speculative_tokens + + # Cudagraph dispatcher for runtime cudagraph dispatching. + self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) + + self.mm_budget = MultiModalBudget( + self.model_config, + self.scheduler_config, + self.mm_registry, + ) if self.supports_mm_inputs else None + + self.reorder_batch_threshold: Optional[int] = None + + # Attention layers that are only in the KVCacheConfig of the runner + # (e.g., KV sharing, encoder-only attention), but not in the + # KVCacheConfig of the scheduler. + self.runner_only_attn_layers: set[str] = set() + + # Cached outputs. 
+ self._draft_token_ids: Optional[Union[list[list[int]], + torch.Tensor]] = None + self.transfer_event = torch.cuda.Event() + self.sampled_token_ids_pinned_cpu = torch.empty( + (self.max_model_len, 1), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + + def _make_buffer(self, + *size: Union[int, torch.SymInt], + dtype: torch.dtype, + numpy: bool = True) -> CpuGpuBuffer: + return CpuGpuBuffer(*size, + dtype=dtype, + device=self.device, + pin_memory=self.pin_memory, + with_numpy=numpy) + + def _init_model_kwargs(self, num_tokens: int): + model_kwargs = dict[str, Any]() + + if not self.is_pooling_model: + return model_kwargs + + num_reqs = self.input_batch.num_reqs + pooling_params = self.input_batch.get_pooling_params() + + token_type_id_requests = dict[int, Any]() + for i, param in enumerate(pooling_params): + if param.extra_kwargs is not None and \ + (token_types := param.extra_kwargs.get( + "compressed_token_type_ids")) is not None: + token_type_id_requests[i] = token_types + + if len(token_type_id_requests) == 0: + return model_kwargs + + seq_lens = self.seq_lens.gpu[:num_reqs] + token_type_ids = [] + + for i in range(num_reqs): + pos = token_type_id_requests.get(i, seq_lens[i]) + ids = (torch.arange(seq_lens[i]) >= pos).int() + token_type_ids.append(ids) + + model_kwargs["token_type_ids"] = torch.concat(token_type_ids).to( + device=self.device) + return model_kwargs + + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: + """ + Update the order of requests in the batch based on the attention + backend's needs. For example, some attention backends (namely MLA) may + want to separate requests based on if the attention computation will be + compute-bound or memory-bound. + + Args: + scheduler_output: The scheduler output. + """ + # Attention free models have zero kv_cache_goups, however models + # like Mamba are also attention free but use the kv_cache for + # keeping its internal state. 
This is why we check the number + # of kv_cache groups instead of solely checking + # for self.model_config.is_attention_free. + if len(self.kv_cache_config.kv_cache_groups) == 0: + return + + if self.reorder_batch_threshold is not None: + # NOTE(lucas): currently no backend supports the custom masking + # required for DCP with q_len > 1, so we assert here. Remove this + # assert once the custom mask is support is added to FA3. + if self.dcp_world_size > 1: + assert self.reorder_batch_threshold == 1, \ + "DCP not support reorder_batch_threshold > 1 now." + reorder_batch_to_split_decodes_and_prefills( + self.input_batch, + scheduler_output, + decode_threshold=self.reorder_batch_threshold) + + # Note: used for model runner override. + def _init_device_properties(self) -> None: + """Initialize attributes from torch.cuda.get_device_properties + """ + self.device_properties = torch.cuda.get_device_properties(self.device) + self.num_sms = self.device_properties.multi_processor_count + + # Note: used for model runner override. + def _sync_device(self) -> None: + torch.cuda.synchronize() + + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + """Update the cached states and the persistent batch with the scheduler + output. + + The updated states are used by the `_prepare_inputs` function to create + the input GPU tensors for the model. + + The SamplingMetadata is updated and copied to the GPU if there is a + new/resumed/paused/finished request in the batch. + """ + # Remove finished requests from the cached states. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. 
In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. + for req_id in scheduler_output.finished_req_ids: + self.input_batch.remove_request(req_id) + + # Free the cached encoder outputs. + for mm_hash in scheduler_output.free_encoder_mm_hashes: + self.encoder_cache.pop(mm_hash, None) + + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. + scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. + for req_id in unscheduled_req_ids: + self.input_batch.remove_request(req_id) + + reqs_to_add: list[CachedRequestState] = [] + # Add new requests to the cached states. 
+ for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + sampling_params = new_req_data.sampling_params + pooling_params = new_req_data.pooling_params + + if sampling_params and \ + sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + if self.is_pooling_model: + assert pooling_params is not None + task = pooling_params.task + assert task is not None, "You did not set `task` in the API" + + model = cast(VllmModelForPooling, self.get_model()) + to_update = model.pooler.get_pooling_updates(task) + to_update.apply(pooling_params) + + req_state = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + prompt_embeds=new_req_data.prompt_embeds, + mm_features=new_req_data.mm_features, + sampling_params=sampling_params, + pooling_params=pooling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + ) + self.requests[req_id] = req_state + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + self._init_mrope_positions(req_state) + + reqs_to_add.append(req_state) + + # Update the states of the running/resumed requests. + is_last_rank = get_pp_group().is_last_rank + req_data = scheduler_output.scheduled_cached_reqs + for i, req_id in enumerate(req_data.req_ids): + req_state = self.requests[req_id] + num_computed_tokens = req_data.num_computed_tokens[i] + new_block_ids = req_data.new_block_ids[i] + resumed_from_preemption = req_data.resumed_from_preemption[i] + + # Update the cached states. 
+ req_state.num_computed_tokens = num_computed_tokens + + if not is_last_rank: + # When using PP, the scheduler sends the sampled tokens back, + # because there's no direct communication between the first- + # stage worker and the last-stage worker. + new_token_ids = req_data.new_token_ids[i] + # Add the sampled token(s) from the previous step (if any). + # This doesn't include "unverified" tokens like spec tokens. + num_new_tokens = (num_computed_tokens + len(new_token_ids) - + req_state.num_tokens) + if num_new_tokens == 1: + # Avoid slicing list in most common case. + req_state.output_token_ids.append(new_token_ids[-1]) + elif num_new_tokens > 0: + req_state.output_token_ids.extend( + new_token_ids[-num_new_tokens:]) + + # Update the block IDs. + if not resumed_from_preemption: + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) + else: + assert new_block_ids is not None + # The request is resumed from preemption. + # Replace the existing block IDs with the new ones. + req_state.block_ids = new_block_ids + + req_index = self.input_batch.req_id_to_index.get(req_id) + if req_index is None: + # The request is not in the persistent batch. + # The request was either preempted and resumed later, or was not + # scheduled in the previous step and needs to be added again. + reqs_to_add.append(req_state) + continue + + # Update the persistent batch. + self.input_batch.num_computed_tokens_cpu[req_index] = ( + num_computed_tokens) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) + + # For the last rank, we don't need to update the token_ids_cpu + # because the sampled tokens are already cached. + if not is_last_rank: + # Add new_token_ids to token_ids_cpu. 
+ start_token_index = num_computed_tokens + end_token_index = num_computed_tokens + len(new_token_ids) + self.input_batch.token_ids_cpu[ + req_index, + start_token_index:end_token_index] = new_token_ids + self.input_batch.num_tokens_no_spec[ + req_index] = end_token_index + self.input_batch.num_tokens[req_index] = end_token_index + + # Add spec_token_ids to token_ids_cpu. + spec_token_ids = ( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, ())) + if spec_token_ids: + num_spec_tokens = len(spec_token_ids) + start_index = self.input_batch.num_tokens_no_spec[req_index] + end_token_index = start_index + num_spec_tokens + self.input_batch.token_ids_cpu[ + req_index, start_index:end_token_index] = spec_token_ids + # NOTE(woosuk): `num_tokens` here may include spec tokens. + self.input_batch.num_tokens[req_index] += num_spec_tokens + + # Add the new or resumed requests to the persistent batch. + # The smaller empty indices are filled first. + for request in reqs_to_add: + self.input_batch.add_request(request) + + # Condense the batched states if there are gaps left by removed requests + self.input_batch.condense() + # Allow attention backend to reorder the batch, potentially + self._may_reorder_batch(scheduler_output) + # Refresh batch metadata with any pending updates. + self.input_batch.refresh_metadata() + + def _update_states_after_model_execute( + self, output_token_ids: torch.Tensor) -> None: + """Update the cached states after model execution. + + This is used for MTP/EAGLE for hybrid models, as in linear attention, + only the last token's state is kept. In MTP/EAGLE, for draft tokens + the state are kept util we decide how many tokens are accepted for + each sequence, and a shifting is done during the next iteration + based on the number of accepted tokens. + """ + if not self.model_config.is_hybrid or not self.speculative_config: + return + + # Find the number of accepted tokens for each sequence. 
+ num_accepted_tokens = (torch.cat( + [ + output_token_ids, + torch.full((output_token_ids.size(0), 1), + -1, + device=output_token_ids.device), + ], + dim=1) == -1).int().argmax(-1).cpu().numpy() + for i, num_tokens in enumerate(num_accepted_tokens): + self.input_batch.num_accepted_tokens_cpu[i] = num_tokens + + def _init_mrope_positions(self, req_state: CachedRequestState): + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + audio_feature_lengths = [] + use_audio_in_video = False + for mm_feature in req_state.mm_features: + mm_item = mm_feature.data + if mm_item is None: + continue + mm_input = mm_item.get_data() + if (t := mm_input.get("image_grid_thw")) is not None: + image_grid_thw.append(t.tolist()) + if (t := mm_input.get("video_grid_thw")) is not None: + video_grid_thw.append(t.tolist()) + if (t := mm_input.get("second_per_grid_ts")) is not None: + second_per_grid_ts.append(t) + if (t := mm_input.get("audio_feature_lengths")) is not None: + audio_feature_lengths.append(t) + if mm_input.get("use_audio_in_video") is True: + use_audio_in_video = True + + if supports_mrope(self.get_model()): + req_state.mrope_positions, req_state.mrope_position_delta = \ + self.model.get_mrope_input_positions( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + else: + req_state.mrope_positions, req_state.mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + + def _extract_mm_kwargs( + self, + scheduler_output: "SchedulerOutput", + ) -> BatchedTensorInputs: 
+ if not scheduler_output or not self.is_multimodal_raw_input_only_model: + return {} + + mm_kwargs = list[MultiModalKwargsItem]() + for req in scheduler_output.scheduled_new_reqs: + for feature in req.mm_features: + if feature.data is not None: + mm_kwargs.append(feature.data) + + # Input all modalities at once + model = cast(SupportsMultiModal, self.model) + mm_kwargs_combined: BatchedTensorInputs = {} + for _, _, mm_kwargs_group in group_mm_kwargs_by_modality( + mm_kwargs, + device=self.device, + pin_memory=self.pin_memory, + merge_by_field_config=model.merge_by_field_config, + ): + mm_kwargs_combined.update(mm_kwargs_group) + + return mm_kwargs_combined + + def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: + if not self.is_multimodal_raw_input_only_model: + return {} + + mm_budget = self.mm_budget + assert mm_budget is not None + + dummy_modality = mm_budget.get_modality_with_max_tokens() + return self._get_mm_dummy_batch(dummy_modality, num_seqs) + + def _get_cumsum_and_arange( + self, + num_tokens: np.ndarray, + cumsum_dtype: Optional[np.dtype] = None, + ) -> tuple[np.ndarray, np.ndarray]: + """Get the cumulative sum and batched arange of the given array. + # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_tokens]) + """ + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype) + total_num_tokens = cu_num_tokens[-1] + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_tokens] - cumsums_offsets + + return cu_num_tokens, arange + + def _prepare_input_ids(self, total_num_scheduled_tokens: int, + cu_num_tokens: np.ndarray) -> None: + """Prepare the input IDs for the current batch. 
+ + Carefully handles the `prev_sampled_token_ids` which can be cached + from the previous engine iteration, in which case those tokens on the + GPU need to be copied into the corresponding slots into input_ids.""" + + if self.input_batch.prev_sampled_token_ids is None: + # Normal scheduling case + self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + if self.enable_prompt_embeds: + self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) + self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) + return + + # Async scheduling case, where some decode requests from the previous + # iteration won't have entries in input_ids_cpu and need to be copied + # on the GPU from prev_sampled_token_ids. + prev_req_id_to_index = self.input_batch.prev_req_id_to_index + assert prev_req_id_to_index is not None + flattened_indices = [] + prev_common_req_indices = [] + indices_match = True + max_flattened_index = -1 + for req_id, cur_index in self.input_batch.req_id_to_index.items(): + if (prev_index := prev_req_id_to_index.get(req_id)) is not None: + prev_common_req_indices.append(prev_index) + # We need to compute the flattened input_ids index of the + # last token in each common request. + flattened_index = cu_num_tokens[cur_index].item() - 1 + flattened_indices.append(flattened_index) + indices_match &= (prev_index == flattened_index) + max_flattened_index = max(max_flattened_index, flattened_index) + num_commmon_tokens = len(flattened_indices) + if num_commmon_tokens < total_num_scheduled_tokens: + # If not all requests are decodes from the last iteration, + # We need to copy the input_ids_cpu to the GPU first. + self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + if self.enable_prompt_embeds: + self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) + self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) + if num_commmon_tokens == 0: + # No requests in common with the previous iteration + # So input_ids_cpu will have all the input ids. 
def _prepare_inputs(
    self, scheduler_output: "SchedulerOutput"
) -> tuple[PerLayerAttnMetadata, torch.Tensor,
           Optional[SpecDecodeMetadata], np.ndarray,
           Optional[CommonAttentionMetadata], int, Optional[UBatchSlices],
           Optional[torch.Tensor]]:
    """Build the per-step model inputs and attention metadata.

    Fills the runner's persistent CPU buffers (input ids, positions,
    query_start_loc, seq_lens, slot mapping, ...) for the scheduled
    batch, copies them to the device, and builds the attention metadata
    for every KV cache group.

    Args:
        scheduler_output: The scheduler output for this step. It must
            schedule at least one token for at least one request.

    Returns:
        A tuple of:
        - attn_metadata: layer-name -> attention-metadata mapping, or a
          list of such mappings (one per micro-batch) when
          ``ubatch_slices`` is not None.
        - logits_indices: indices of the tokens whose logits are needed.
        - spec_decode_metadata: ``None`` when no speculative tokens are
          scheduled in this step.
        - num_scheduled_tokens: int32 array of scheduled tokens per
          request, in input-batch order.
        - spec_decode_common_attn_metadata: the common attention metadata
          selected for the drafter, or ``None``.
        - max_num_scheduled_tokens: the largest per-request count.
        - ubatch_slices: as returned by ``ubatch_split``.
        - num_tokens_after_padding: as returned by ``ubatch_split``.
    """
    total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
    assert total_num_scheduled_tokens > 0
    num_reqs = self.input_batch.num_reqs
    assert num_reqs > 0

    # OPTIMIZATION: Start copying the block table first.
    # This way, we can overlap the copy with the following CPU operations.
    self.input_batch.block_table.commit_block_table(num_reqs)

    # Get the number of scheduled tokens for each request.
    req_ids = self.input_batch.req_ids
    tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
    num_scheduled_tokens = np.array(tokens, dtype=np.int32)
    max_num_scheduled_tokens = max(tokens)

    # Get request indices.
    # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
    req_indices = np.repeat(self.arange_np[:num_reqs],
                            num_scheduled_tokens)

    # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
    # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    cu_num_tokens, arange = self._get_cumsum_and_arange(
        num_scheduled_tokens)

    # Get positions.
    positions_np = self.positions.np[:total_num_scheduled_tokens]
    np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
           arange,
           out=positions_np)

    # Calculate M-RoPE positions.
    # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
    if self.uses_mrope:
        self._calc_mrope_positions(scheduler_output)

    # Get token indices.
    # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
    # where M is the max_model_len.
    token_indices = (positions_np +
                     req_indices * self.input_batch.token_ids_cpu.shape[1])
    token_indices_tensor = torch.from_numpy(token_indices)

    # NOTE(woosuk): We use torch.index_select instead of np.take here
    # because torch.index_select is much faster than np.take for large
    # tensors.
    torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
                       0,
                       token_indices_tensor,
                       out=self.input_ids.cpu[:total_num_scheduled_tokens])
    if self.enable_prompt_embeds:
        is_token_ids = self.input_batch.is_token_ids.flatten()
        torch.index_select(
            is_token_ids,
            0,
            token_indices_tensor,
            out=self.is_token_ids.cpu[:total_num_scheduled_tokens])

    # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
    # the InputBatch, we need to fill in the prompt embeds into the expected
    # spots in the GpuModelRunner's pre-allocated prompt_embeds tensor.
    if self.input_batch.req_prompt_embeds:
        output_idx = 0
        for req_idx in range(num_reqs):
            num_sched = num_scheduled_tokens[req_idx]

            # Skip if this request doesn't have embeddings
            if req_idx not in self.input_batch.req_prompt_embeds:
                output_idx += num_sched
                continue

            # Skip if no tokens scheduled
            if num_sched <= 0:
                output_idx += num_sched
                continue

            req_embeds = self.input_batch.req_prompt_embeds[req_idx]
            start_pos = self.input_batch.num_computed_tokens_cpu[req_idx]

            # Skip if trying to read beyond available embeddings
            if start_pos >= req_embeds.shape[0]:
                output_idx += num_sched
                continue

            # Copy available embeddings
            end_pos = start_pos + num_sched
            actual_end = min(end_pos, req_embeds.shape[0])
            actual_num_sched = actual_end - start_pos

            if actual_num_sched > 0:
                self.inputs_embeds.cpu[output_idx:output_idx +
                                       actual_num_sched].copy_(
                                           req_embeds[start_pos:actual_end])

            # Advance by the scheduled count (not the copied count) so the
            # next request lands in its own slot range.
            output_idx += num_sched

    self.input_batch.block_table.compute_slot_mapping(
        req_indices, positions_np)
    self.input_batch.block_table.commit_slot_mapping(
        total_num_scheduled_tokens)

    # Prepare the attention metadata.
    self.query_start_loc.np[0] = 0
    self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
    # Note: pad query_start_loc to be non-decreasing, as kernels
    # like FlashAttention requires that
    self.query_start_loc.np[num_reqs + 1:].fill(cu_num_tokens[-1])
    self.query_start_loc.copy_to_gpu()
    query_start_loc = self.query_start_loc.gpu[:num_reqs + 1]

    num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
    num_tokens_padded = num_tokens_unpadded + self.get_local_padding(
        num_tokens_unpadded)
    # "Uniform decode": every request is scheduled for exactly
    # `uniform_decode_query_len` tokens.
    uniform_decode = \
        (max_num_scheduled_tokens == self.uniform_decode_query_len) and \
        (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
    ubatch_slices, num_tokens_after_padding = \
        ubatch_split(num_scheduled_tokens,
                     num_tokens_unpadded,
                     num_tokens_padded,
                     uniform_decode=uniform_decode,
                     vllm_config=self.vllm_config)

    self.seq_lens.np[:num_reqs] = (
        self.input_batch.num_computed_tokens_cpu[:num_reqs] +
        num_scheduled_tokens)
    # Fill unused with 0 for full cuda graph mode.
    self.seq_lens.np[num_reqs:].fill(0)
    self.seq_lens.copy_to_gpu()
    seq_lens = self.seq_lens.gpu[:num_reqs]
    max_seq_len = self.seq_lens.np[:num_reqs].max().item()

    num_tokens = [
        self.requests[r].num_tokens for r in self.input_batch.req_ids
    ]
    num_tokens_np = np.array(num_tokens, dtype=np.int32)

    # Record the index of requests that should not be sampled,
    # so that we could clear the sampled tokens before returning
    discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np
    discard_request_indices = np.nonzero(discard_requests_mask)[0]
    self.num_discarded_requests = len(discard_request_indices)
    self.discard_request_indices.np[:self.num_discarded_requests] = (
        discard_request_indices)

    self.discard_request_indices.copy_to_gpu(self.num_discarded_requests)

    # Copy the tensors to the GPU.
    self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)

    if self.uses_mrope:
        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
            self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
            non_blocking=True)
    else:
        # Common case (1D positions)
        self.positions.copy_to_gpu(total_num_scheduled_tokens)

    use_spec_decode = len(
        scheduler_output.scheduled_spec_decode_tokens) > 0
    if not use_spec_decode:
        # NOTE(woosuk): Due to chunked prefills, the batch may contain
        # partial requests. While we should not sample any token
        # from these partial requests, we do so for simplicity.
        # We will ignore the sampled tokens from the partial requests.
        # TODO: Support prompt logprobs.
        logits_indices = query_start_loc[1:] - 1
        num_draft_tokens = None
        spec_decode_metadata = None
    else:
        # Get the number of draft tokens for each request.
        # Iterate over the dictionary rather than all requests since not all
        # requests have draft tokens.
        num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
        # For chunked prefills, use -1 as mask rather than 0, as guided
        # decoding may rollback speculative tokens.
        num_decode_draft_tokens = np.full(num_reqs, -1, dtype=np.int32)
        for req_id, draft_token_ids in (
                scheduler_output.scheduled_spec_decode_tokens.items()):
            req_idx = self.input_batch.req_id_to_index[req_id]
            num_draft_tokens[req_idx] = len(draft_token_ids)
            num_decode_draft_tokens[req_idx] = (len(draft_token_ids) if (
                self.input_batch.num_computed_tokens_cpu[req_idx]
                >= self.input_batch.num_prompt_tokens[req_idx]) else -1)
        spec_decode_metadata = self._calc_spec_decode_metadata(
            num_draft_tokens, cu_num_tokens)
        logits_indices = spec_decode_metadata.logits_indices

        # For DECODE only cuda graph of some attention backends (e.g., GDN).
        self.num_decode_draft_tokens.np[:num_reqs] = num_decode_draft_tokens
        self.num_decode_draft_tokens.np[num_reqs:].fill(-1)
        self.num_decode_draft_tokens.copy_to_gpu()

    logits_indices_padded = None
    if self.cache_config.kv_sharing_fast_prefill:
        logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
            logits_indices)

    attn_metadata: PerLayerAttnMetadata = {}
    if ubatch_slices is not None:
        # One layer-name -> metadata dict per micro-batch.
        attn_metadata = [dict() for _ in range(len(ubatch_slices))]

    # Used in the below loop.
    query_start_loc_cpu = self.query_start_loc.cpu[:num_reqs + 1]
    seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
    num_computed_tokens_cpu = (
        self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs])
    spec_decode_common_attn_metadata = None
    if use_spec_decode:
        self.num_accepted_tokens.np[:num_reqs] = (
            self.input_batch.num_accepted_tokens_cpu[:num_reqs])
        self.num_accepted_tokens.np[num_reqs:].fill(1)
        self.num_accepted_tokens.copy_to_gpu()

    # Prepare the attention metadata for each KV cache group and make layers
    # in the same group share the same metadata.
    for kv_cache_group_id, kv_cache_group_spec in enumerate(
            self.kv_cache_config.kv_cache_groups):
        encoder_seq_lens = self._get_encoder_seq_lens(
            scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs)

        if isinstance(kv_cache_group_spec.kv_cache_spec,
                      EncoderOnlyAttentionSpec):
            # Encoder-only layers do not have KV cache, so we need to
            # create a dummy block table and slot mapping for them.
            blk_table_tensor = torch.zeros(
                (num_reqs, 1),
                dtype=torch.int32,
                device=self.device,
            )
            slot_mapping = torch.zeros(
                (total_num_scheduled_tokens, ),
                dtype=torch.int64,
                device=self.device,
            )
            num_common_prefix_blocks = 0
        else:
            blk_table = self.input_batch.block_table[kv_cache_group_id]
            blk_table_tensor = blk_table.get_device_tensor(num_reqs)
            slot_mapping = (
                blk_table.slot_mapping.gpu[:total_num_scheduled_tokens])

            # Fill unused with -1. Needed for reshape_and_cache in full cuda
            # graph mode.
            blk_table.slot_mapping.gpu[total_num_scheduled_tokens:].fill_(
                -1)
            num_common_prefix_blocks = (
                scheduler_output.
                num_common_prefix_blocks[kv_cache_group_id])

        common_attn_metadata = CommonAttentionMetadata(
            query_start_loc=query_start_loc,
            query_start_loc_cpu=query_start_loc_cpu,
            seq_lens=seq_lens,
            seq_lens_cpu=seq_lens_cpu,
            num_computed_tokens_cpu=num_computed_tokens_cpu,
            num_reqs=num_reqs,
            num_actual_tokens=total_num_scheduled_tokens,
            max_query_len=max_num_scheduled_tokens,
            max_seq_len=max_seq_len,
            block_table_tensor=blk_table_tensor,
            slot_mapping=slot_mapping,
            logits_indices_padded=logits_indices_padded,
            num_logits_indices=logits_indices.size(0),
            causal=True,
            encoder_seq_lens=encoder_seq_lens,
        )

        # Remember the first suitable group's metadata for the drafter:
        # for an EAGLE drafter, the group containing its first attention
        # layer; otherwise simply the first group.
        if (self.speculative_config
                and spec_decode_common_attn_metadata is None):
            if isinstance(self.drafter, EagleProposer):
                if (self.drafter.attn_layer_names[0]
                        in kv_cache_group_spec.layer_names):
                    spec_decode_common_attn_metadata = common_attn_metadata
            else:
                spec_decode_common_attn_metadata = common_attn_metadata

        for attn_group in self.attn_groups[kv_cache_group_id]:
            # Prepare for cascade attention if enabled & beneficial.
            common_prefix_len = 0
            builder = attn_group.get_metadata_builder()
            if self.cascade_attn_enabled:
                common_prefix_len = self._compute_cascade_attn_prefix_len(
                    num_scheduled_tokens,
                    num_common_prefix_blocks,
                    attn_group.kv_cache_spec,
                    builder,
                )

            extra_attn_metadata_args = {}
            if use_spec_decode and isinstance(builder,
                                              GDNAttentionMetadataBuilder):
                extra_attn_metadata_args = dict(
                    num_accepted_tokens=self.num_accepted_tokens.
                    gpu[:num_reqs],
                    num_decode_draft_tokens_cpu=self.
                    num_decode_draft_tokens.cpu[:num_reqs],
                )

            if ubatch_slices is not None:
                assert isinstance(attn_metadata, list)
                common_attn_metadata_list = split_attn_metadata(
                    ubatch_slices, common_attn_metadata)
                # Use a dedicated loop variable. Rebinding
                # `common_attn_metadata` here would make the next attention
                # group of this KV cache group split the last micro-batch's
                # metadata instead of the full batch's.
                for ubid, ubatch_attn_metadata in enumerate(
                        common_attn_metadata_list):
                    attn_metadata_i = (attn_group.get_metadata_builder(
                        ubatch_id=ubid).build(
                            common_prefix_len=common_prefix_len,
                            common_attn_metadata=ubatch_attn_metadata))
                    # Only this attention group's layers get its metadata
                    # (same as the non-ubatch branch); writing to every
                    # layer of the KV cache group would overwrite the
                    # metadata built by the other attention groups.
                    for layer_name in attn_group.layer_names:
                        attn_metadata[ubid][layer_name] = attn_metadata_i
            else:
                assert isinstance(attn_metadata, dict)
                attn_metadata_i = builder.build(
                    common_prefix_len=common_prefix_len,
                    common_attn_metadata=common_attn_metadata,
                    **extra_attn_metadata_args)
                for layer_name in attn_group.layer_names:
                    attn_metadata[layer_name] = attn_metadata_i

    # Hot-Swap lora model
    if self.lora_config:
        self.set_active_loras(self.input_batch, num_scheduled_tokens)

    return (attn_metadata, logits_indices, spec_decode_metadata,
            num_scheduled_tokens, spec_decode_common_attn_metadata,
            max_num_scheduled_tokens, ubatch_slices,
            num_tokens_after_padding)
+ num_common_prefix_blocks: Number of shared KV cache blocks. + + Returns: + int: Length of common prefix in tokens. + """ + common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size + if common_prefix_len == 0: + # Common case. + return 0 + + # NOTE(woosuk): Cascade attention uses two attention kernels: one + # for the common prefix and the other for the rest. For the first + # kernel, we concatenate all the query tokens (possibly from + # different requests) and treat them as if they are from the same + # request. Then, we use bi-directional attention to process the + # common prefix in the KV cache. Importantly, this means that the + # first kernel does not do any masking. + + # Consider the following example: + # Request 1's input query: [D, E, X] + # Request 1's kv cache: [A, B, C, D, E, X] + # Request 1's num_computed_tokens: 3 (i.e., [A, B, C]) + # Request 2's input query: [E, Y] + # Request 2's kv cache: [A, B, C, D, E, Y] + # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D]) + + # If we use [A, B, C, D, E] as the common prefix, then the + # first kernel will compute the bi-directional attention between + # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E]. + # However, this is wrong because D in Request 1 should not attend to + # E in the common prefix (i.e., we need masking). + # To avoid this, [A, B, C, D] should be the common prefix. + # That is, the common prefix should be capped by the minimum + # num_computed_tokens among the requests, and plus one to include + # the first token of the query. + + # In practice, we use [A, B, C] as the common prefix, instead of + # [A, B, C, D] (i.e., the common prefix is capped by the minimum + # num_computed_tokens, without plus one). + # This is because of an implementation detail: We want to always + # use two kernels for cascade attention. 
def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
    """Fill the M-RoPE position buffer for the tokens scheduled this step.

    Requests are visited in input-batch order and their positions are
    written back-to-back. For each request the scheduled tokens are split
    into a prompt part, whose positions are copied from the request's
    pre-computed ``mrope_positions``, and a completion part, whose
    positions are generated on the fly by
    ``MRotaryEmbedding.get_next_input_positions_tensor``.
    """
    write_ptr = 0
    for batch_idx, req_id in enumerate(self.input_batch.req_ids):
        req_state = self.requests[req_id]
        assert req_state.mrope_positions is not None

        num_computed = self.input_batch.num_computed_tokens_cpu[batch_idx]
        num_scheduled = scheduler_output.num_scheduled_tokens[req_id]
        num_prompt = length_from_prompt_token_ids_or_embeds(
            req_state.prompt_token_ids, req_state.prompt_embeds)

        # Split the scheduled span at the prompt/completion boundary.
        if num_computed + num_scheduled > num_prompt:
            prompt_len = max(0, num_prompt - num_computed)
            completion_len = max(0, num_scheduled - prompt_len)
        else:
            prompt_len = num_scheduled
            completion_len = 0

        assert num_scheduled == prompt_len + completion_len

        if prompt_len > 0:
            # Prompt positions were pre-computed; copy the needed slice.
            src = req_state.mrope_positions[:, num_computed:num_computed +
                                            prompt_len]
            self.mrope_positions.cpu[:, write_ptr:write_ptr +
                                     prompt_len] = src
            write_ptr += prompt_len

        if completion_len > 0:
            # Completion positions are derived from the request's delta.
            MRotaryEmbedding.get_next_input_positions_tensor(
                out=self.mrope_positions.np,
                out_offset=write_ptr,
                mrope_position_delta=req_state.mrope_position_delta,
                context_len=num_computed + prompt_len,
                num_new_tokens=completion_len,
            )
            write_ptr += completion_len
cu_num_sampled_tokens: [4, 5, 8, 9, 11] + # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] + cu_num_sampled_tokens, arange = self._get_cumsum_and_arange( + num_sampled_tokens, cumsum_dtype=np.int32) + # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] + logits_indices = np.repeat( + cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens) + # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] + logits_indices += arange + + # Compute the bonus logits indices. + bonus_logits_indices = cu_num_sampled_tokens - 1 + + # Compute the draft logits indices. + # cu_num_draft_tokens: [3, 3, 5, 5, 6] + # arange: [0, 1, 2, 0, 1, 0] + cu_num_draft_tokens, arange = self._get_cumsum_and_arange( + num_draft_tokens, cumsum_dtype=np.int32) + # [0, 0, 0, 5, 5, 9] + target_logits_indices = np.repeat( + cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens) + # [0, 1, 2, 5, 6, 9] + target_logits_indices += arange + + # TODO: Optimize the CPU -> GPU copy. + cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to( + self.device, non_blocking=True) + logits_indices = torch.from_numpy(logits_indices).to(self.device, + non_blocking=True) + target_logits_indices = torch.from_numpy(target_logits_indices).to( + self.device, non_blocking=True) + bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to( + self.device, non_blocking=True) + + # Compute the draft token ids. 
def _prepare_kv_sharing_fast_prefill(
    self,
    logits_indices: torch.Tensor,
) -> torch.Tensor:
    """Stage ``logits_indices`` in the persistent buffer and return a view.

    Args:
        logits_indices: Non-empty 1-D tensor of logits indices for the
            current step.

    Returns:
        A prefix of ``self.kv_sharing_fast_prefill_logits_indices``. Its
        length is ``num_logits``, or that value padded by
        ``pad_for_cudagraph`` when CUDA graphs are enabled and
        ``num_logits`` does not exceed the largest captured batch size.
    """
    staging = self.kv_sharing_fast_prefill_logits_indices
    assert staging is not None
    num_logits = logits_indices.shape[0]
    assert num_logits > 0

    staging[:num_logits].copy_(logits_indices)
    # The tail may still hold indices from earlier iterations that are
    # larger than the current batch. Overwrite it with the last valid
    # index so every entry stays in range.
    last_index = logits_indices[-1].item()
    staging[num_logits:].fill_(last_index)

    graphs_apply = (
        self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
        and num_logits <= self.cudagraph_batch_sizes[-1])
    if graphs_apply:
        # Piecewise CUDA graphs: round the length up to a captured size.
        padded_len = self.vllm_config.pad_for_cudagraph(num_logits)
    else:
        padded_len = num_logits
    return staging[:padded_len]
def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
    """Run the multimodal encoder and cache its outputs by ``mm_hash``.

    The scheduled encoder inputs are collected with
    ``_batch_mm_kwargs_from_scheduler``, grouped by modality, and encoded
    group by group. Each output is then stored in ``self.encoder_cache``
    under the hash of the item it came from. Returns without doing
    anything when no encoder input is scheduled.
    """
    scheduled_items, hashes_and_positions = (
        self._batch_mm_kwargs_from_scheduler(scheduler_output))
    if not scheduled_items:
        return

    # Batch mm inputs as much as we can: if a request in the batch has
    # multiple modalities or a different modality than the previous one,
    # we process it separately to preserve item order.
    # FIXME(ywang96): This is a hacky way to deal with multiple modalities
    # in the same batch while still being able to benefit from batching
    # multimodal inputs. The proper solution should be reordering the
    # encoder outputs.
    model = cast(SupportsMultiModal, self.model)
    all_outputs = []
    modality_groups = group_mm_kwargs_by_modality(
        scheduled_items,
        device=self.device,
        pin_memory=self.pin_memory,
        merge_by_field_config=model.merge_by_field_config,
    )
    for modality, num_items, group_kwargs in modality_groups:
        if self.is_multimodal_pruning_enabled and modality == "video":
            # (ekhvedchenia): Temporary hack to limit peak memory usage
            # when processing multimodal data. The scheduler can put too
            # many video samples into one batch because it compares the
            # pruned vision-token count against the compute budget, so
            # videos are encoded one item at a time here.
            group_outputs = []
            chunk_size = 1
            for begin in range(0, num_items, chunk_size):
                chunk_kwargs = {
                    field: value[begin:begin + chunk_size]
                    for field, value in group_kwargs.items()
                }
                group_outputs.extend(
                    model.get_multimodal_embeddings(**chunk_kwargs))
        else:
            # `group_outputs` is either of the following:
            # 1. A tensor of shape (num_items, feature_size, hidden_size)
            #    in case feature_size is fixed across all multimodal items.
            # 2. A list or tuple (length: num_items) of tensors, each of
            #    shape (feature_size, hidden_size) in case the feature
            #    size is dynamic depending on the input multimodal items.
            group_outputs = model.get_multimodal_embeddings(**group_kwargs)

        sanity_check_mm_encoder_outputs(
            group_outputs,
            expected_num_items=num_items,
        )
        all_outputs.extend(group_outputs)

    # Cache the encoder outputs by mm_hash
    for (mm_hash, pos_info), output in zip(hashes_and_positions,
                                           all_outputs):
        self.encoder_cache[mm_hash] = scatter_mm_placeholders(
            output,
            is_embed=pos_info.is_embed,
        )
scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = \ + req_state.num_computed_tokens + shift_computed_tokens + for mm_feature in req_state.mm_features: + pos_info = mm_feature.mm_position + start_pos = pos_info.offset + num_encoder_tokens = pos_info.length + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens, + ) + assert start_idx < end_idx + + mm_hash = mm_feature.identifier + encoder_output = self.encoder_cache.get(mm_hash, None) + assert encoder_output is not None,\ + f"Encoder cache miss for {mm_hash}." 
+ + if (is_embed := pos_info.is_embed) is not None: + is_embed = is_embed[start_idx:end_idx] + + req_start_pos = req_start_idx + start_pos - num_computed_tokens + is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ + = True if is_embed is None else is_embed + + mm_embeds_item = gather_mm_placeholders( + encoder_output[start_idx:end_idx], + is_embed=is_embed, + ) + mm_embeds_req.append(mm_embeds_item) + + if self.is_multimodal_pruning_enabled and self.uses_mrope: + assert req_state.mrope_positions is not None + should_sync_mrope_positions = True + mm_embeds_req, new_mrope_positions, new_delta = ( + self.model.recompute_mrope_positions( + input_ids=req_state.prompt_token_ids, + multimodal_embeddings=mm_embeds_req, + mrope_positions=req_state.mrope_positions, + num_computed_tokens=req_state.num_computed_tokens, + )) + req_state.mrope_positions.copy_(new_mrope_positions) + req_state.mrope_position_delta = new_delta + + mm_embeds.extend(mm_embeds_req) + req_start_idx += num_scheduled_tokens + + is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) + + if should_sync_mrope_positions: + self._calc_mrope_positions(scheduler_output) + self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens) + + return mm_embeds, is_mm_embed + + def _extract_encoder_inputs( + self, + scheduler_output: "SchedulerOutput", + ) -> dict[str, torch.Tensor]: + """Extract encoder inputs for encoder-decoder models. + + This method extracts multimodal input features from scheduled encoder + inputs and formats them for the encoder-decoder model forward pass. + """ + # Batch the multi-modal inputs using the helper method. 
+ mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output) + + if not mm_kwargs: + return {} + + # Group MM kwargs by modality and extract features + model = cast(SupportsMultiModal, self.model) + encoder_features = {} + for _, _, mm_kwargs_group in group_mm_kwargs_by_modality( + mm_kwargs, + device=self.device, + pin_memory=self.pin_memory, + merge_by_field_config=model.merge_by_field_config, + ): + # Add the grouped features to encoder_features dict + # This allows the model to receive them as kwargs (e.g., + # input_features=...) + encoder_features.update(mm_kwargs_group) + + return encoder_features + + def get_model(self) -> nn.Module: + # get raw model out of the cudagraph wrapper. + if isinstance(self.model, (CUDAGraphWrapper, UBatchWrapper)): + return self.model.unwrap() + return self.model + + def get_supported_generation_tasks(self) -> list[GenerationTask]: + model = self.get_model() + supported_tasks = list[GenerationTask]() + + if is_text_generation_model(model): + supported_tasks.append("generate") + + if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + + supported_tasks.append("transcription") + + return supported_tasks + + def get_supported_pooling_tasks(self) -> list[PoolingTask]: + model = self.get_model() + if not is_pooling_model(model): + return [] + + supported_tasks = list(model.pooler.get_supported_tasks()) + + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in supported_tasks): + supported_tasks.remove("encode") + + logger.debug_once("Chunked prefill is not supported with " + "encode task which using ALL pooling. 
" + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + + if "score" in supported_tasks: + num_labels = getattr(self.model_config.hf_config, "num_labels", 0) + if num_labels != 1: + supported_tasks.remove("score") + logger.debug_once( + "Score API is only enabled for num_labels == 1.") + + return supported_tasks + + def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + tasks = list[SupportedTask]() + + if self.model_config.runner_type == "generate": + tasks.extend(self.get_supported_generation_tasks()) + if self.model_config.runner_type == "pooling": + tasks.extend(self.get_supported_pooling_tasks()) + + return tuple(tasks) + + def sync_and_slice_intermediate_tensors( + self, num_tokens: int, intermediate_tensors: IntermediateTensors, + sync_self: bool) -> IntermediateTensors: + + assert self.intermediate_tensors is not None + + tp = self.vllm_config.parallel_config.tensor_parallel_size + is_rs = is_residual_scattered_for_sp(self.vllm_config, num_tokens) + + # When sequence parallelism is enabled, the "residual" tensor is sharded + # across tensor parallel ranks, so each rank only needs its own slice. + if sync_self: + assert intermediate_tensors is not None + for k, v in intermediate_tensors.items(): + is_scattered = k == "residual" and is_rs + copy_len = num_tokens // tp if is_scattered else \ + num_tokens + self.intermediate_tensors[k][:copy_len].copy_( + v[:copy_len], non_blocking=True) + + return IntermediateTensors({ + k: + v[:num_tokens // + tp] if k == "residual" and is_rs else v[:num_tokens] + for k, v in self.intermediate_tensors.items() + }) + + def eplb_step(self, + is_dummy: bool = False, + is_profile: bool = False) -> None: + """ + Step for the EPLB (Expert Parallelism Load Balancing) state. 
+ """ + if not self.parallel_config.enable_eplb: + return + + assert self.eplb_state is not None + model = self.get_model() + assert is_mixture_of_experts(model) + self.eplb_state.step( + model, + is_dummy, + is_profile, + log_stats=self.parallel_config.eplb_config.log_balancedness, + ) + + def get_dp_padding(self, + num_tokens: int) -> tuple[int, Optional[torch.Tensor]]: + """ + Determines the total number of tokens that each rank will run. + All ranks will be padded out so that they run with the same number + of tokens + + Returns: tuple[ + num_pad_tokens: The number of tokens that will be added to the batch + num_tokens_after_padding: A tensor containing the total number of + tokens for each DP rank including padding. + ] + """ + dp_size = self.vllm_config.parallel_config.data_parallel_size + dp_rank = self.vllm_config.parallel_config.data_parallel_rank + + # For DP: Don't pad when setting enforce_eager. + # This lets us set enforce_eager on the prefiller in a P/D setup and + # still use CUDA graphs (enabled by this padding) on the decoder. + # + # TODO(tms) : There are many cases where padding is enabled for + # prefills, causing unnecessary and excessive padding of activations. + + if dp_size == 1 or self.vllm_config.model_config.enforce_eager: + # Early exit. + return 0, None + + num_tokens_across_dp = DPMetadata.num_tokens_across_dp( + num_tokens, dp_size, dp_rank) + max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item() + num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] * + dp_size, + device="cpu", + dtype=torch.int32) + return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding + + def get_local_padding(self, num_tokens_unpadded: int) -> int: + + num_tokens_padded = num_tokens_unpadded + + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]): + # Use piecewise CUDA graphs. + # Add padding to the batch size. 
+ num_tokens_padded = self.vllm_config.pad_for_cudagraph( + num_tokens_unpadded) + else: + # Eager mode. + # Pad tokens to multiple of tensor_parallel_size when + # enabled collective fusion for SP + tp_size = self.vllm_config.parallel_config.tensor_parallel_size + if self.vllm_config.compilation_config.pass_config. \ + enable_sequence_parallelism and tp_size > 1: + num_tokens_padded = round_up(num_tokens_unpadded, tp_size) + + num_pad_tokens = num_tokens_padded - num_tokens_unpadded + return num_pad_tokens + + # This is where the second ubatch is adjusted to account for the padding. + # Should be called after attention metadata creation. This just pads + # the second ubatch slice out to the total number of tokens + # (num_tokens + padding) + def pad_out_ubatch_slice(self, ubatch_slices: UBatchSlices, + num_total_tokens: int): + padded_second_ubatch_slice = slice(ubatch_slices[1].token_slice.start, + num_total_tokens) + ubatch_slices[1] = UBatchSlice(padded_second_ubatch_slice, + padded_second_ubatch_slice) + + def _pool( + self, + hidden_states: torch.Tensor, + num_scheduled_tokens: int, + num_scheduled_tokens_np: np.ndarray, + ) -> ModelRunnerOutput: + assert self.input_batch.num_reqs ==\ + len(self.input_batch.pooling_params), \ + "Either all or none of the requests in" \ + " a batch must be pooling request" + + hidden_states = hidden_states[:num_scheduled_tokens] + pooling_metadata = self.input_batch.get_pooling_metadata() + pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(), + device=hidden_states.device) + seq_lens_cpu = self.seq_lens.cpu[:self.input_batch.num_reqs] + + model = cast(VllmModelForPooling, self.model) + raw_pooler_output: PoolerOutput = model.pooler( + hidden_states=hidden_states, + pooling_metadata=pooling_metadata, + ) + raw_pooler_output = json_map_leaves( + lambda x: x.to("cpu", non_blocking=True), + raw_pooler_output, + ) + self._sync_device() + + pooler_output: list[Optional[torch.Tensor]] = [] + for raw_output, 
seq_len, prompt_len in zip( + raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens): + + output = raw_output if seq_len == prompt_len else None + pooler_output.append(output) + + return ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + ) + + def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH + and hasattr(self, "cudagraph_batch_sizes") + and self.cudagraph_batch_sizes + and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # Use CUDA graphs. + # Add padding to the batch size. + return self.vllm_config.pad_for_cudagraph(num_scheduled_tokens) + + # Eager mode. + # Pad tokens to multiple of tensor_parallel_size when + # enabled collective fusion for SP + tp_size = self.vllm_config.parallel_config.tensor_parallel_size + if (self.compilation_config.pass_config.enable_sequence_parallelism + and tp_size > 1): + return round_up(num_scheduled_tokens, tp_size) + return num_scheduled_tokens + + def _preprocess( + self, + scheduler_output: "SchedulerOutput", + intermediate_tensors: Optional[IntermediateTensors] = None, + ubatch_slices: Optional[UBatchSlices] = None, + num_tokens_after_padding: Optional[torch.Tensor] = None, + ) -> tuple[int, int, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], torch.Tensor, + Optional[IntermediateTensors], dict[str, Any]]: + + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + if ubatch_slices: + assert num_tokens_after_padding is not None + num_input_tokens = int(num_tokens_after_padding[0].item() * 2) + self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens) + elif ubatch_slices is None: + num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens) + num_pad, num_tokens_after_padding = 
self.get_dp_padding( + num_input_tokens) + num_input_tokens += num_pad + + # _prepare_inputs may reorder the batch, so we must gather multi + # modal outputs after that to ensure the correct order + if (self.supports_mm_inputs and get_pp_group().is_first_rank + and not self.model_config.is_encoder_decoder): + # Run the multimodal encoder if any. + self._execute_mm_encoder(scheduler_output) + mm_embeds, is_mm_embed = self._gather_mm_embeddings( + scheduler_output) + + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + if (self.vllm_config.model_config.hf_config.model_type == + "qwen3_omni_moe"): + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids.gpu[:num_scheduled_tokens], + multimodal_embeddings=mm_embeds or None, + is_multimodal=is_mm_embed, + ) + else: + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids.gpu[:num_scheduled_tokens], + multimodal_embeddings=mm_embeds or None) + + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( + inputs_embeds_scheduled) + + input_ids = None + inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + model_kwargs = { + **self._init_model_kwargs(num_scheduled_tokens), + **self._extract_mm_kwargs(scheduler_output), + } + elif self.enable_prompt_embeds and get_pp_group().is_first_rank: + # Get the input embeddings for the tokens that are not input embeds, + # then put them into the appropriate positions. + # TODO(qthequartermasterman): Since even when prompt embeds are + # enabled, (a) not all requests will use prompt embeds, and (b) + # after the initial prompt is processed, the rest of the generated + # tokens will be token ids, it is not desirable to have the + # embedding layer outside of the CUDA graph all the time. 
The v0 + # engine avoids this by "double compiling" the CUDA graph, once + # with input_ids and again with inputs_embeds, for all num_tokens. + # If a batch only has token ids, then including the embedding layer + # in the CUDA graph will be more performant (like in the else case + # below). + token_ids_idx = self.is_token_ids.gpu[:num_scheduled_tokens] \ + .nonzero(as_tuple=False) \ + .squeeze(1) + # Some tokens ids may need to become embeds + if token_ids_idx.numel() > 0: + token_ids = self.input_ids.gpu[token_ids_idx] + tokens_to_embeds = self.model.get_input_embeddings( + input_ids=token_ids) + self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds + + inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + model_kwargs = self._init_model_kwargs(num_input_tokens) + input_ids = None + else: + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. 
+ input_ids = self.input_ids.gpu[:num_input_tokens] + inputs_embeds = None + model_kwargs = self._init_model_kwargs(num_input_tokens) + if self.uses_mrope: + positions = self.mrope_positions.gpu[:, :num_input_tokens] + else: + positions = self.positions.gpu[:num_input_tokens] + + if get_pp_group().is_first_rank: + intermediate_tensors = None + else: + intermediate_tensors = self.sync_and_slice_intermediate_tensors( + num_input_tokens, intermediate_tensors, True) + + if (self.model_config.is_encoder_decoder + and scheduler_output.scheduled_encoder_inputs): + encoder_inputs = self._extract_encoder_inputs(scheduler_output) + model_kwargs.update(encoder_inputs) + + return ( + num_scheduled_tokens, + num_input_tokens, + num_tokens_after_padding, + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ) + + def _sample( + self, logits: Optional[torch.Tensor], + spec_decode_metadata: Optional[SpecDecodeMetadata] + ) -> SamplerOutput: + # Sample the next token and get logprobs if needed. + sampling_metadata = self.input_batch.sampling_metadata + if spec_decode_metadata is None: + sampler_output = self.sampler( + logits=logits, + sampling_metadata=sampling_metadata, + ) + else: + # When indexing with a tensor (bonus_logits_indices), PyTorch + # creates a new tensor with separate storage from the original + # logits tensor. This means any in-place operations on bonus_logits + # won't affect the original logits tensor. + assert logits is not None + bonus_logits = logits[spec_decode_metadata.bonus_logits_indices] + sampler_output = self.sampler( + logits=bonus_logits, + sampling_metadata=sampling_metadata, + ) + bonus_token_ids = sampler_output.sampled_token_ids + + # Just like `bonus_logits`, `target_logits` is a new tensor with + # separate storage from the original `logits` tensor. Therefore, + # it is safe to update `target_logits` in place. 
+ target_logits = logits[spec_decode_metadata.target_logits_indices] + output_token_ids = self.rejection_sampler( + spec_decode_metadata, + None, # draft_probs + target_logits, + bonus_token_ids, + sampling_metadata, + ) + sampler_output.sampled_token_ids = output_token_ids + self._update_states_after_model_execute(output_token_ids) + + return sampler_output + + def _bookkeeping_sync( + self, scheduler_output: "SchedulerOutput", + sampler_output: SamplerOutput, logits: Optional[torch.Tensor], + hidden_states: torch.Tensor, num_scheduled_tokens: int + ) -> tuple[ + dict[str, int], + Optional[LogprobsLists], + list[list[int]], + dict[str, Optional[LogprobsTensors]], + list[str], + dict[str, int], + list[int], + ]: + num_nans_in_logits = {} + if envs.VLLM_COMPUTE_NANS_IN_LOGITS: + num_nans_in_logits = self._get_nans_in_logits(logits) + + discard_sampled_tokens_req_indices = \ + self.discard_request_indices.np[:self.num_discarded_requests] + for i in discard_sampled_tokens_req_indices: + gen = self.input_batch.generators.get(int(i)) + if gen is not None: + gen.set_offset(gen.get_offset() - 4) + + # Copy some objects so they don't get modified after returning. + # This is important when using async scheduling. + req_ids_output_copy = self.input_batch.req_ids.copy() + req_id_to_index_output_copy = \ + self.input_batch.req_id_to_index.copy() + + # NOTE: GPU -> CPU Sync happens here. + # Move as many CPU operations as possible before this sync point. + logprobs_tensors = sampler_output.logprobs_tensors + logprobs_lists = logprobs_tensors.tolists() \ + if logprobs_tensors is not None else None + + # Compute prompt logprobs if needed. 
+ prompt_logprobs_dict = self._get_prompt_logprobs_dict( + hidden_states[:num_scheduled_tokens], + scheduler_output.num_scheduled_tokens, + ) + + num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] + sampled_token_ids = sampler_output.sampled_token_ids + invalid_req_indices = [] + if not self.use_async_scheduling: + # Get the valid generated tokens. + max_gen_len = sampled_token_ids.shape[-1] + if max_gen_len == 1: + # No spec decode tokens. + valid_sampled_token_ids = self._to_list(sampled_token_ids) + else: + # Includes spec decode tokens. + valid_sampled_token_ids = self.rejection_sampler.parse_output( + sampled_token_ids, + self.input_batch.vocab_size, + ) + # Mask out the sampled tokens that should not be sampled. + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[int(i)].clear() + else: + valid_sampled_token_ids = [] + invalid_req_indices = discard_sampled_tokens_req_indices.tolist() + invalid_req_indices_set = set(invalid_req_indices) + assert sampled_token_ids.shape[-1] == 1 + + # Cache the sampled tokens on the GPU and avoid CPU sync. + # These will be copied into input_ids in the next step + # when preparing inputs. + self.input_batch.prev_sampled_token_ids = \ + sampled_token_ids + self.input_batch.prev_sampled_token_ids_invalid_indices = \ + invalid_req_indices_set + self.input_batch.prev_req_id_to_index = { + req_id: i + for i, req_id in enumerate(self.input_batch.req_ids) + if i not in invalid_req_indices_set + } + + # Cache the sampled tokens in the model runner, so that the scheduler + # doesn't need to send them back. + # NOTE(woosuk): As an exception, when using PP, the scheduler sends + # the sampled tokens back, because there's no direct communication + # between the first-stage worker and the last-stage worker. 
+ req_ids = self.input_batch.req_ids + for req_idx in range(num_sampled_tokens): + if self.use_async_scheduling: + sampled_ids = [-1] if \ + req_idx not in invalid_req_indices_set else None + else: + sampled_ids = valid_sampled_token_ids[req_idx] + if not sampled_ids: + continue + + start_idx = self.input_batch.num_tokens_no_spec[req_idx] + end_idx = start_idx + len(sampled_ids) + assert end_idx <= self.max_model_len, ( + "Sampled token IDs exceed the max model length. " + f"Total number of tokens: {end_idx} > max_model_len: " + f"{self.max_model_len}") + + self.input_batch.token_ids_cpu[req_idx, + start_idx:end_idx] = sampled_ids + self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True + self.input_batch.num_tokens_no_spec[req_idx] = end_idx + self.input_batch.num_tokens[req_idx] = end_idx + + req_id = req_ids[req_idx] + req_state = self.requests[req_id] + req_state.output_token_ids.extend(sampled_ids) + + return ( + num_nans_in_logits, + logprobs_lists, + valid_sampled_token_ids, + prompt_logprobs_dict, + req_ids_output_copy, + req_id_to_index_output_copy, + invalid_req_indices, + ) + + @contextmanager + def synchronize_input_prep(self): + if self.prepare_inputs_event is None: + yield + return + + # Ensure prior step has finished with reused CPU tensors. + # This is required in the async scheduling case because + # the CPU->GPU transfer happens async. + self.prepare_inputs_event.synchronize() + try: + yield + finally: + self.prepare_inputs_event.record() + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: + with record_function_or_nullcontext("Preprocess"): + with self.synchronize_input_prep(): + # Update persistent batch states. 
+ self._update_states(scheduler_output) + + if not scheduler_output.total_num_scheduled_tokens: + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward( + scheduler_output, self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, tokens, please disable " + "it when the requests need prompt logprobs") + + # Prepare the decoder inputs. + (attn_metadata, logits_indices, spec_decode_metadata, + num_scheduled_tokens_np, spec_decode_common_attn_metadata, + max_query_len, ubatch_slices, num_tokens_after_padding + ) = self._prepare_inputs(scheduler_output) + + ( + num_scheduled_tokens, + num_input_tokens, + num_tokens_across_dp, + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ) = self._preprocess(scheduler_output, intermediate_tensors, + ubatch_slices, num_tokens_after_padding) + + uniform_decode = (max_query_len + == self.uniform_decode_query_len) and ( + num_scheduled_tokens + == self.input_batch.num_reqs * max_query_len) + batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, + uniform_decode=uniform_decode) + cudagraph_runtime_mode, batch_descriptor = \ + self.cudagraph_dispatcher.dispatch(batch_descriptor) + + # This is currently to get around the assert in the DPMetadata + # where it wants `num_tokens_across_dp` to align with `num_tokens` + if ubatch_slices is not None: + num_input_tokens = ubatch_slices[0].num_tokens + + # Run the model. + # Use persistent buffers for CUDA graphs. 
+ with (set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens, + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ubatch_slices=ubatch_slices, + ), record_function_or_nullcontext("Forward"), + self.maybe_get_kv_connector_output(scheduler_output) as + kv_connector_output): + model_output = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) + + with record_function_or_nullcontext("Postprocess"): + if self.use_aux_hidden_state_outputs: + # True when EAGLE 3 is used. + hidden_states, aux_hidden_states = model_output + else: + # Common case. + hidden_states = model_output + aux_hidden_states = None + + if not self.broadcast_pp_output: + # Common case. + if not get_pp_group().is_last_rank: + # Return the intermediate tensors. + assert isinstance(hidden_states, IntermediateTensors) + hidden_states.kv_connector_output = kv_connector_output + return hidden_states + + if self.is_pooling_model: + # Return the pooling output. + output = self._pool(hidden_states, num_scheduled_tokens, + num_scheduled_tokens_np) + output.kv_connector_output = kv_connector_output + return output + + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states) + else: + # Rare case. 
+ assert not self.is_pooling_model + + if not get_pp_group().is_last_rank: + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp( + self.vllm_config, num_input_tokens) + } + get_pp_group().send_tensor_dict( + hidden_states.tensors, + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) + logits = None + else: + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states) + + model_output_broadcast_data = {} + if logits is not None: + model_output_broadcast_data["logits"] = logits.contiguous() + + model_output_broadcast_data = get_pp_group( + ).broadcast_tensor_dict(model_output_broadcast_data, + src=len(get_pp_group().ranks) - 1) + assert model_output_broadcast_data is not None + logits = model_output_broadcast_data["logits"] + + # Apply structured output bitmasks if present + if scheduler_output.grammar_bitmask is not None: + apply_grammar_bitmask(scheduler_output, self.input_batch, + logits, self.device) + + with record_function_or_nullcontext("Sample"): + sampler_output = self._sample(logits, spec_decode_metadata) + + def propose_draft_token_ids(sampled_token_ids): + assert spec_decode_common_attn_metadata is not None + with record_function_or_nullcontext("Draft"): + self._draft_token_ids = self.propose_draft_token_ids( + scheduler_output, + sampled_token_ids, + self.input_batch.sampling_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + spec_decode_metadata, + spec_decode_common_attn_metadata, + ) + + use_padded_batch_for_eagle = self.speculative_config and \ + self.speculative_config.use_eagle() and \ + not self.speculative_config.disable_padded_drafter_batch + effective_drafter_max_model_len = self.max_model_len + if effective_drafter_max_model_len is None: + effective_drafter_max_model_len = self.model_config.max_model_len + if (self.speculative_config + and self.speculative_config.draft_model_config is not None + and 
self.speculative_config.draft_model_config.max_model_len + is not None): + effective_drafter_max_model_len = ( + self.speculative_config.draft_model_config.max_model_len) + input_fits_in_drafter = spec_decode_common_attn_metadata and ( + spec_decode_common_attn_metadata.seq_lens.max() + + self.speculative_config.num_speculative_tokens + <= effective_drafter_max_model_len) + if use_padded_batch_for_eagle and input_fits_in_drafter: + # EAGLE speculative decoding can use the GPU sampled tokens + # as inputs, and does not need to wait for bookkeeping to finish. + propose_draft_token_ids(sampler_output.sampled_token_ids) + + with record_function_or_nullcontext("Bookkeep"): + ( + num_nans_in_logits, + logprobs_lists, + valid_sampled_token_ids, + prompt_logprobs_dict, + req_ids_output_copy, + req_id_to_index_output_copy, + invalid_req_indices, + ) = self._bookkeeping_sync(scheduler_output, sampler_output, + logits, hidden_states, + num_scheduled_tokens) + + if (self.speculative_config and not use_padded_batch_for_eagle + and input_fits_in_drafter): + # ngram and other speculative decoding methods use the sampled + # tokens on the CPU, so they are run after bookkeeping. 
+ propose_draft_token_ids(valid_sampled_token_ids) + + with record_function_or_nullcontext("EPLB"): + self.eplb_step() + + output = ModelRunnerOutput( + req_ids=req_ids_output_copy, + req_id_to_index=req_id_to_index_output_copy, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + kv_connector_output=kv_connector_output, + num_nans_in_logits=num_nans_in_logits, + ) + + if not self.use_async_scheduling: + return output + + return AsyncGPUModelRunnerOutput( + model_runner_output=output, + sampled_token_ids=sampler_output.sampled_token_ids, + invalid_req_indices=invalid_req_indices, + async_output_copy_stream=self.async_output_copy_stream, + ) + + def take_draft_token_ids(self) -> Optional[DraftTokenIds]: + if self._draft_token_ids is None: + return None + req_ids = self.input_batch.req_ids + if isinstance(self._draft_token_ids, torch.Tensor): + draft_token_ids = self._draft_token_ids.tolist() + else: + draft_token_ids = self._draft_token_ids + self._draft_token_ids = None + return DraftTokenIds(req_ids, draft_token_ids) + + def propose_draft_token_ids( + self, + scheduler_output: "SchedulerOutput", + sampled_token_ids: Union[torch.Tensor, list[list[int]]], + sampling_metadata: SamplingMetadata, + hidden_states: torch.Tensor, + sample_hidden_states: torch.Tensor, + aux_hidden_states: Optional[list[torch.Tensor]], + spec_decode_metadata: Optional[SpecDecodeMetadata], + common_attn_metadata: CommonAttentionMetadata, + ) -> Union[list[list[int]], torch.Tensor]: + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + if self.speculative_config.method == "ngram": + assert isinstance(sampled_token_ids, list) + assert isinstance(self.drafter, NgramProposer) + draft_token_ids = self.drafter.propose( + sampled_token_ids, self.input_batch.req_ids, + self.input_batch.num_tokens_no_spec, + self.input_batch.token_ids_cpu, + self.input_batch.spec_decode_unsupported_reqs) + elif 
self.speculative_config.method == "medusa": + assert isinstance(sampled_token_ids, list) + assert isinstance(self.drafter, MedusaProposer) + + if sample_hidden_states.shape[0] == len(sampled_token_ids): + # The input to the target model does not include draft tokens. + hidden_states = sample_hidden_states + else: + indices = [] + offset = 0 + assert spec_decode_metadata is not None + for num_draft, tokens in zip( + spec_decode_metadata.num_draft_tokens, + sampled_token_ids): + indices.append(offset + len(tokens) - 1) + offset += num_draft + 1 + indices = torch.tensor(indices, device=self.device) + hidden_states = sample_hidden_states[indices] + + draft_token_ids = self.drafter.propose( + target_hidden_states=hidden_states, + sampling_metadata=sampling_metadata, + ) + elif self.speculative_config.use_eagle(): + assert isinstance(self.drafter, EagleProposer) + + if self.speculative_config.disable_padded_drafter_batch: + # When padded-batch is disabled, the sampled_token_ids should be + # the cpu-side list[list[int]] of valid sampled tokens for each + # request, with invalid requests having empty lists. + assert isinstance(sampled_token_ids, list), \ + "sampled_token_ids should be a python list when" \ + "padded-batch is disabled." + next_token_ids = self.drafter.prepare_next_token_ids_cpu( + sampled_token_ids, self.requests, self.input_batch, + scheduler_output.num_scheduled_tokens) + else: + # When using padded-batch, the sampled_token_ids should be + # the gpu tensor of sampled tokens for each request, of shape + # (num_reqs, num_spec_tokens + 1) with rejected tokens having + # value -1. + assert isinstance(sampled_token_ids, torch.Tensor), \ + "sampled_token_ids should be a torch.Tensor when" \ + "padded-batch is enabled." 
+ next_token_ids, valid_sampled_tokens_count = \ + self.drafter.prepare_next_token_ids_padded( + common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_indices.gpu, + self.num_discarded_requests + ) + + if spec_decode_metadata is None: + token_indices_to_sample = None + # input_ids can be None for multimodal models. + target_token_ids = self.input_ids.gpu[:num_scheduled_tokens] + # TODO(woosuk): Support M-RoPE. + target_positions = self.positions.gpu[:num_scheduled_tokens] + if self.use_aux_hidden_state_outputs: + assert aux_hidden_states is not None + target_hidden_states = torch.cat( + [h[:num_scheduled_tokens] for h in aux_hidden_states], + dim=-1) + else: + target_hidden_states = hidden_states[:num_scheduled_tokens] + else: + if self.speculative_config.disable_padded_drafter_batch: + token_indices_to_sample = None + common_attn_metadata, token_indices =\ + self.drafter.prepare_inputs( + common_attn_metadata, + sampled_token_ids, + spec_decode_metadata.num_draft_tokens) + else: + common_attn_metadata, token_indices, \ + token_indices_to_sample =\ + self.drafter.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count) + + target_token_ids = self.input_ids.gpu[token_indices] + # TODO(woosuk): Support M-RoPE. 
+ target_positions = self.positions.gpu[token_indices] + if self.use_aux_hidden_state_outputs: + assert aux_hidden_states is not None + target_hidden_states = torch.cat( + [h[token_indices] for h in aux_hidden_states], dim=-1) + else: + target_hidden_states = hidden_states[token_indices] + mm_embeds = None + if self.supports_mm_inputs: + mm_embeds = self._gather_mm_embeddings(scheduler_output, + shift_computed_tokens=1) + + draft_token_ids = self.drafter.propose( + target_token_ids=target_token_ids, + target_positions=target_positions, + target_hidden_states=target_hidden_states, + next_token_ids=next_token_ids, + last_token_indices=token_indices_to_sample, + sampling_metadata=sampling_metadata, + common_attn_metadata=common_attn_metadata, + mm_embeds=mm_embeds, + ) + return draft_token_ids + + def update_config(self, overrides: dict[str, Any]) -> None: + allowed_config_names = {"load_config", "model_config"} + for config_name, config_overrides in overrides.items(): + assert config_name in allowed_config_names, \ + f"Config `{config_name}` not supported. " \ + f"Allowed configs: {allowed_config_names}" + config = getattr(self, config_name) + new_config = update_config(config, config_overrides) + setattr(self, config_name, new_config) + + def load_model(self, eep_scale_up: bool = False) -> None: + """ + Args: + eep_scale_up: the model loading is for elastic EP scale up. 
+ """ + logger.info("Starting to load model %s...", self.model_config.model) + if eep_scale_up: + from vllm.distributed.parallel_state import get_ep_group + num_local_physical_experts = torch.empty(1, + dtype=torch.int32, + device="cpu") + torch.distributed.broadcast(num_local_physical_experts, + group=get_ep_group().cpu_group, + group_src=0) + num_local_physical_experts = int(num_local_physical_experts.item()) + new_ep_size = get_ep_group().world_size + global_expert_load, old_global_expert_indices = ( + EplbState.recv_state()) + num_logical_experts = global_expert_load.shape[1] + self.parallel_config.eplb_config.num_redundant_experts = ( + num_local_physical_experts * new_ep_size - num_logical_experts) + assert old_global_expert_indices.shape[ + 1] % num_local_physical_experts == 0 + old_ep_size = old_global_expert_indices.shape[ + 1] // num_local_physical_experts + rank_mapping = { + old_ep_rank: old_ep_rank + for old_ep_rank in range(old_ep_size) + } + else: + global_expert_load = None + old_global_expert_indices = None + rank_mapping = None + + with DeviceMemoryProfiler() as m: + time_before_load = time.perf_counter() + model_loader = get_model_loader(self.load_config) + logger.info("Loading model from scratch...") + self.model = model_loader.load_model( + vllm_config=self.vllm_config, model_config=self.model_config) + if self.lora_config: + self.model = self.load_lora_model(self.model, self.vllm_config, + self.device) + if hasattr(self, "drafter"): + logger.info("Loading drafter model...") + self.drafter.load_model(self.model) + if self.use_aux_hidden_state_outputs: + if supports_eagle3(self.model): + self.model.set_aux_hidden_state_layers( + self.model.get_eagle3_aux_hidden_state_layers()) + else: + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested") + time_after_load = time.perf_counter() + self.model_memory_usage = m.consumed_memory + logger.info("Model loading took %.4f GiB and %.6f seconds", + 
self.model_memory_usage / GiB_bytes, + time_after_load - time_before_load) + prepare_communication_buffer_for_model(self.model) + + self.is_multimodal_pruning_enabled = (supports_multimodal_pruning( + self.model) and self.model_config.multimodal_config. + is_multimodal_pruning_enabled()) + + if is_mixture_of_experts( + self.model) and self.parallel_config.enable_eplb: + logger.info("EPLB is enabled for model %s.", + self.model_config.model) + self.eplb_state = EplbState.build( + self.model, + self.device, + self.parallel_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, + ) + + if ( + self.vllm_config.compilation_config.level == \ + CompilationLevel.DYNAMO_AS_IS and supports_dynamo() + ): + backend = self.vllm_config.compilation_config.init_backend( + self.vllm_config) + compilation_counter.dynamo_as_is_count += 1 + self.model.compile(fullgraph=True, backend=backend) + return + # for other compilation levels, cudagraph behavior is controlled by + # CudagraphWraper and CudagraphDispatcher of vllm. + + # wrap the model with full cudagraph wrapper if needed. + if self.compilation_config.cudagraph_mode.has_full_cudagraphs() \ + and not self.parallel_config.enable_dbo: + self.model = CUDAGraphWrapper(self.model, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL) + elif self.parallel_config.enable_dbo: + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + self.model = UBatchWrapper(self.model, self.vllm_config, + CUDAGraphMode.FULL, self.device) + else: + self.model = UBatchWrapper(self.model, self.vllm_config, + CUDAGraphMode.NONE, self.device) + + def reload_weights(self) -> None: + assert getattr(self, "model", None) is not None, \ + "Cannot reload weights before model is loaded." 
+ model_loader = get_model_loader(self.load_config) + logger.info("Reloading weights inplace...") + model = self.get_model() + model_loader.load_weights(model, model_config=self.model_config) + + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + model = self.get_model() + TensorizerLoader.save_model( + model, + tensorizer_config=tensorizer_config, + model_config=self.model_config, + ) + + def _get_prompt_logprobs_dict( + self, + hidden_states: torch.Tensor, + num_scheduled_tokens: dict[str, int], + ) -> dict[str, Optional[LogprobsTensors]]: + num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs + if not num_prompt_logprobs_dict: + return {} + + in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu + prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {} + + # Since prompt logprobs are a rare feature, prioritize simple, + # maintainable loop over optimal performance. + completed_prefill_reqs = [] + for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items(): + num_tokens = num_scheduled_tokens[req_id] + + # Get metadata for this request. + request = self.requests[req_id] + if request.prompt_token_ids is None: + # Prompt logprobs is incompatible with prompt embeddings + continue + + num_prompt_tokens = len(request.prompt_token_ids) + prompt_token_ids = torch.tensor(request.prompt_token_ids).to( + self.device, non_blocking=True) + + # Set up target LogprobsTensors object. + logprobs_tensors = in_progress_dict.get(req_id) + if not logprobs_tensors: + # Create empty logprobs CPU tensors for the entire prompt. + # If chunked, we'll copy in slice by slice. + logprobs_tensors = LogprobsTensors.empty_cpu( + num_prompt_tokens - 1, num_prompt_logprobs + 1) + in_progress_dict[req_id] = logprobs_tensors + + # Determine number of logits to retrieve. 
+ start_idx = request.num_computed_tokens + start_tok = start_idx + 1 + num_remaining_tokens = num_prompt_tokens - start_tok + if num_tokens <= num_remaining_tokens: + # This is a chunk, more tokens remain. + # In the == case, there are no more prompt logprobs to produce + # but we want to defer returning them to the next step where we + # have new generated tokens to return. + num_logits = num_tokens + else: + # This is the last chunk of prompt tokens to return. + num_logits = num_remaining_tokens + completed_prefill_reqs.append(req_id) + prompt_logprobs_dict[req_id] = logprobs_tensors + + if num_logits <= 0: + # This can happen for the final chunk if we prefilled exactly + # (num_prompt_tokens - 1) tokens for this request in the prior + # step. There are no more prompt logprobs to produce. + continue + + # Get the logits corresponding to this req's prompt tokens. + # If this is a partial request (i.e. chunked prefill), + # then there is prompt logprob generated for each index. + req_idx = self.input_batch.req_id_to_index[req_id] + offset = self.query_start_loc.np[req_idx].item() + prompt_hidden_states = hidden_states[offset:offset + num_logits] + logits = self.model.compute_logits(prompt_hidden_states) + + # Get the "target" tokens for each index. For prompt at index i, + # the token at prompt index i+1 is the "sampled" token we want + # to gather the logprob for. + tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits] + + # Compute prompt logprobs. + logprobs = self.sampler.compute_logprobs(logits) + token_ids, logprobs, ranks = self.sampler.gather_logprobs( + logprobs, num_prompt_logprobs, tgt_token_ids) + + # Transfer GPU->CPU async. 
+ chunk_slice = slice(start_idx, start_idx + num_logits) + logprobs_tensors.logprob_token_ids[chunk_slice].copy_( + token_ids, non_blocking=True) + logprobs_tensors.logprobs[chunk_slice].copy_(logprobs, + non_blocking=True) + logprobs_tensors.selected_token_ranks[chunk_slice].copy_( + ranks, non_blocking=True) + + # Remove requests that have completed prefill from the batch + # num_prompt_logprobs_dict. + for req_id in completed_prefill_reqs: + del num_prompt_logprobs_dict[req_id] + del in_progress_dict[req_id] + + # Must synchronize the non-blocking GPU->CPU transfers. + if prompt_logprobs_dict: + self._sync_device() + + return prompt_logprobs_dict + + def _get_nans_in_logits( + self, + logits: Optional[torch.Tensor], + ) -> dict[str, int]: + try: + if logits is None: + return {req_id: 0 for req_id in self.input_batch.req_ids} + + num_nans_in_logits = {} + num_nans_for_index = logits.isnan().sum(dim=-1).cpu().numpy() + for req_id in self.input_batch.req_ids: + req_index = self.input_batch.req_id_to_index[req_id] + num_nans_in_logits[req_id] = ( + int(num_nans_for_index[req_index]) + if num_nans_for_index is not None + and req_index < logits.shape[0] else 0) + return num_nans_in_logits + except IndexError: + return {} + + @contextmanager + def maybe_randomize_inputs(self, input_ids: torch.Tensor): + """ + Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. 
+ This is to help balance expert-selection + - during profile_run + - during DP rank dummy run + """ + dp_size = self.vllm_config.parallel_config.data_parallel_size + randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 + if not randomize_inputs: + yield + else: + import functools + + @functools.cache + def rand_input_ids() -> torch.Tensor: + return torch.randint_like( + self.input_ids.gpu, + low=0, + high=self.model_config.get_vocab_size(), + dtype=input_ids.dtype) + + logger.debug_once("Randomizing dummy data for DP Rank") + input_ids.copy_(rand_input_ids()[:input_ids.size(0)], + non_blocking=True) + yield + input_ids.fill_(0) + + def _get_mm_dummy_batch( + self, + modality: str, + max_items_per_batch: int, + ) -> BatchedTensorInputs: + """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( + model_config=self.model_config, + seq_len=self.max_model_len, + mm_counts={modality: 1}, + cache=self.mm_budget.cache, + ) + dummy_mm_data = dummy_decoder_data.multi_modal_data + + # Result in the maximum GPU consumption of the model + dummy_mm_item = dummy_mm_data[modality][0] + dummy_mm_items = [dummy_mm_item] * max_items_per_batch + + model = cast(SupportsMultiModal, self.model) + return next(mm_kwargs_group + for _, _, mm_kwargs_group in group_mm_kwargs_by_modality( + dummy_mm_items, + device=self.device, + pin_memory=self.pin_memory, + merge_by_field_config=model.merge_by_field_config, + )) + + @torch.inference_mode() + def _dummy_run( + self, + num_tokens: int, + cudagraph_runtime_mode: Optional[CUDAGraphMode] = None, + force_attention: bool = False, + uniform_decode: bool = False, + allow_microbatching: bool = True, + skip_eplb: bool = False, + is_profile: bool = False, + create_mixed_batch: bool = False, + remove_lora: bool = True, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Run a dummy forward pass to warm up/profile run or 
capture the + CUDA graph for the model. + + Args: + num_tokens: Number of tokens to run the dummy forward pass. + cudagraph_runtime_mode: used to control the behavior. + - if not set will determine the cudagraph mode based on using + the self.cudagraph_dispatcher. + - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run + - CUDAGraphMode.PIECEWISE: Piecewise cudagraph. + - CUDAGraphMode.FULL: Full cudagraph, attention metadata is + needed. + force_attention: If True, always create attention metadata. Used to + warm up attention backend when mode is NONE. + uniform_decode: If True, the batch is a uniform decode batch. + skip_eplb: If True, skip EPLB state update. + is_profile: If True, this is a profile run. + create_mixed_batch: If True, create a mixed batch with both decode + (1 token) and prefill (multiple tokens) requests. + remove_lora: If False, dummy LoRAs are not destroyed after the run + """ + assert cudagraph_runtime_mode is None or cudagraph_runtime_mode in { + CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL + } + + # If cudagraph_mode.decode_mode() == FULL and + # cudagraph_mode.separate_routine(). This means that we are using + # different graphs and/or modes for mixed prefill-decode batches vs. + # uniform decode batches. A uniform decode batch means that all + # requests have identical query length, except a potential virtual + # request (shorter) in the batch account for padding. + # Uniform decode batch could either be common pure decode, where + # max_query_len == 1, or speculative decode, where + # max_query_len == 1 + num_spec_decode_tokens. + + # When setting max_query_len = 1, we switch to and capture the optimized + # routine of FA2 for pure decode, i.e., Flashdecode + an optimization + # for GQA/MQA. 
+ max_query_len = self.uniform_decode_query_len if uniform_decode else \ + num_tokens + + # Set num_scheduled_tokens based on num_tokens and max_num_seqs + # for dummy run with LoRA so that the num_reqs collectively + # has num_tokens in total. + assert num_tokens <= self.scheduler_config.max_num_batched_tokens + max_num_reqs = self.scheduler_config.max_num_seqs + if create_mixed_batch: + assert not uniform_decode + # Create mixed batch: + # first half decode tokens, second half one prefill + num_decode_tokens = num_tokens // 2 + num_prefill_tokens = num_tokens - num_decode_tokens + num_reqs = num_decode_tokens + 1 + + # Create decode requests (1 token each) followed by prefill request + num_scheduled_tokens_list = [1] * num_decode_tokens + [ + num_prefill_tokens + ] + # Note: Overriding max_query_len to be the prefill tokens + max_query_len = num_prefill_tokens + elif uniform_decode: + assert not create_mixed_batch + num_reqs = cdiv(num_tokens, max_query_len) + num_scheduled_tokens_list = [max_query_len] * num_reqs + if num_tokens % max_query_len != 0: + num_scheduled_tokens_list[-1] = num_tokens % max_query_len + else: + num_reqs = min(num_tokens, max_num_reqs) + min_tokens_per_req = num_tokens // num_reqs + num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs + num_scheduled_tokens_list[-1] += num_tokens % num_reqs + + assert sum(num_scheduled_tokens_list) == num_tokens + assert len(num_scheduled_tokens_list) == num_reqs + num_scheduled_tokens = np.array(num_scheduled_tokens_list, + dtype=np.int32) + total_num_scheduled_tokens = int(num_scheduled_tokens.sum()) + + ubatch_slices = None + num_tokens_after_padding = None + + # We currently only microbatch if the number of tokens is + # over a certain threshold. 
+ if self.parallel_config.enable_dbo and allow_microbatching: + ubatch_slices, ubatch_num_tokens_after_padding = ubatch_split( + num_scheduled_tokens, + total_num_scheduled_tokens, + total_num_scheduled_tokens, + uniform_decode=uniform_decode, + vllm_config=self.vllm_config, + ) + # Currently when DBO is enabled `ubatch_split` returns + # the num_tokens_after_padding for a single ubatch, but we have 2 + # TODO(sage,lucas): this is cruft that should be addressed in the + # padding refactor. + if ubatch_num_tokens_after_padding is not None: + num_tokens_after_padding = ubatch_num_tokens_after_padding * 2 + + # If we failed to microbatch, currently need to resynchronize + # TODO(lucas,sage): we should be able to avoid this second sync by + # refactoring `get_dp_padding_ubatch` and `get_dp_padding` into + # a single `coordinate_batch_across_dp` function. + if num_tokens_after_padding is None: + num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) + num_tokens_after_padding = num_tokens + num_pad + else: + num_tokens_across_dp = num_tokens_after_padding + num_tokens_after_padding = int(num_tokens_after_padding[0].item()) + + attn_metadata: Optional[PerLayerAttnMetadata] = None + + # If force_attention is True, we always capture attention. Otherwise, + # it only happens for cudagraph_runtime_mode=FULL. + if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL: + attn_metadata = {} + if ubatch_slices is not None: + attn_metadata = [dict() for _ in range(len(ubatch_slices))] + + if create_mixed_batch: + # In the mixed batch mode (used for FI warmup), we use + # shorter sequence lengths to run faster. 
+ # TODO(luka) better system for describing dummy batches + seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] + else: + seq_lens = max_query_len + self.seq_lens.np[:num_reqs] = seq_lens + self.seq_lens.np[num_reqs:] = 0 + self.seq_lens.copy_to_gpu() + + cum_num_tokens, _ = self._get_cumsum_and_arange( + num_scheduled_tokens) + self.query_start_loc.np[1:num_reqs + 1] = cum_num_tokens + self.query_start_loc.copy_to_gpu() + + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + common_attn_metadata = CommonAttentionMetadata( + query_start_loc=self.query_start_loc.gpu[:num_reqs + 1], + query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + + 1], + seq_lens=self.seq_lens.gpu[:num_reqs], + seq_lens_cpu=self.seq_lens.cpu[:num_reqs], + num_computed_tokens_cpu=self.input_batch. + num_computed_tokens_cpu_tensor[:num_reqs], + num_reqs=num_reqs, + num_actual_tokens=num_tokens, + max_query_len=max_query_len, + max_seq_len=self.max_model_len, + block_table_tensor=self.input_batch. 
+ block_table[kv_cache_group_id].get_device_tensor(num_reqs), + slot_mapping=self.input_batch.block_table[ + kv_cache_group_id].slot_mapping.gpu[:num_tokens], + causal=True) + for attn_group in self.attn_groups[kv_cache_group_id]: + if ubatch_slices is not None: + common_attn_metadata_list = split_attn_metadata( + ubatch_slices, common_attn_metadata) + for ubid, common_attn_metadata in enumerate( + common_attn_metadata_list): + assert common_attn_metadata.max_query_len == 1 + attn_metadata_i = (attn_group\ + .get_metadata_builder(ubatch_id=ubid)\ + .build_for_cudagraph_capture(common_attn_metadata)) + for layer_name in attn_group.layer_names: + assert type(attn_metadata) is list + attn_metadata[ubid][ + layer_name] = attn_metadata_i + else: + assert type(attn_metadata) is dict + attn_metadata_i = attn_group.get_metadata_builder()\ + .build_for_cudagraph_capture(common_attn_metadata) + for layer_name in attn_group.layer_names: + attn_metadata[layer_name] = attn_metadata_i + + with self.maybe_dummy_run_with_lora(self.lora_config, + num_scheduled_tokens, remove_lora): + model_kwargs = self._init_model_kwargs(num_tokens) + if (self.supports_mm_inputs + and not self.model_config.is_encoder_decoder): + input_ids = None + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] + model_kwargs = { + **model_kwargs, + **self._dummy_mm_kwargs(num_reqs), + } + elif self.enable_prompt_embeds: + input_ids = None + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] + model_kwargs = self._init_model_kwargs(num_tokens) + else: + input_ids = self.input_ids.gpu[:num_tokens] + inputs_embeds = None + + if self.uses_mrope: + positions = self.mrope_positions.gpu[:, :num_tokens] + else: + positions = self.positions.gpu[:num_tokens] + + if get_pp_group().is_first_rank: + intermediate_tensors = None + else: + if self.intermediate_tensors is None: + self.intermediate_tensors = ( + self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + 
device=self.device)) + + intermediate_tensors = self.sync_and_slice_intermediate_tensors( + num_tokens, None, False) + + # filter out the valid batch descriptor + _cg_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( + BatchDescriptor(num_tokens=num_tokens_after_padding, + uniform_decode=uniform_decode)) \ + if not is_profile else (CUDAGraphMode.NONE, None) + if cudagraph_runtime_mode is not None: + # we allow forcing NONE when the dispatcher disagrees to support + # warm ups for cudagraph capture + assert cudagraph_runtime_mode == CUDAGraphMode.NONE or \ + cudagraph_runtime_mode == _cg_mode, ( + f"Cudagraph runtime mode mismatch at dummy_run. " + f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.") + else: + cudagraph_runtime_mode = _cg_mode + + if ubatch_slices is not None: + # Adjust values to reflect a single ubatch. + # TODO(sage,lucas): this is cruft that should be addressed in + # the padding refactor. + num_tokens_after_padding = ubatch_slices[0].num_tokens + if num_tokens_across_dp is not None: + num_tokens_across_dp[:] = num_tokens_after_padding + + with self.maybe_randomize_inputs(input_ids), set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens_after_padding, + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ubatch_slices=ubatch_slices): + outputs = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) + + if self.use_aux_hidden_state_outputs: + hidden_states, _ = outputs + else: + hidden_states = outputs + + if self.speculative_config and self.speculative_config.use_eagle(): + assert isinstance(self.drafter, EagleProposer) + self.drafter.dummy_run(num_tokens) + + # This is necessary to avoid blocking DP. + # For dummy runs, we typically skip EPLB since we don't have any real + # requests to process. 
+ # However, in DP settings, there may be cases when some DP ranks do + # not have any requests to process, so they're executing dummy batches. + # In such cases, we still have to trigger EPLB to make sure + # ranks execute the rearrangement in synchronization. + if not skip_eplb: + self.eplb_step(is_dummy=True, is_profile=is_profile) + + logit_indices = np.cumsum(num_scheduled_tokens) - 1 + return hidden_states, hidden_states[logit_indices] + + @torch.inference_mode() + def _dummy_sampler_run( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + # The dummy hidden states may contain special values, + # like `inf` or `nan`. + # To avoid breaking the sampler, we use a random tensor here instead. + hidden_states = torch.rand_like(hidden_states) + + logits = self.model.compute_logits(hidden_states) + num_reqs = logits.size(0) + + dummy_tensors = lambda v: torch.full( + (num_reqs, ), v, device=self.device) + + dummy_metadata = SamplingMetadata( + temperature=dummy_tensors(0.5), + all_greedy=False, + all_random=False, + top_p=dummy_tensors(0.9), + top_k=dummy_tensors(logits.size(1) - 1), + generators={}, + max_num_logprobs=None, + no_penalties=True, + prompt_token_ids=None, + frequency_penalties=dummy_tensors(0.1), + presence_penalties=dummy_tensors(0.1), + repetition_penalties=dummy_tensors(0.1), + output_token_ids=[[] for _ in range(num_reqs)], + allowed_token_ids_mask=None, + bad_words_token_ids={}, + logitsprocs=LogitsProcessors(), + ) + try: + sampler_output = self.sampler(logits=logits, + sampling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "CUDA out of memory occurred when warming up sampler with " + f"{num_reqs} dummy requests. 
Please try lowering " + "`max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e + if self.speculative_config: + draft_token_ids = [[0] for _ in range(num_reqs)] + dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy( + draft_token_ids, self.device) + + num_tokens = sum(len(ids) for ids in draft_token_ids) + # draft_probs = torch.randn( + # num_tokens, logits.shape[-1], device=self.device, + # dtype=logits.dtype) + draft_probs = None + target_logits = torch.randn(num_tokens, + logits.shape[-1], + device=self.device, + dtype=logits.dtype) + # NOTE(woosuk): Here, we should use int32 because the sampler uses + # int32 for bonus_token_ids. If the dtype mismatches, re-compilation + # will occur at runtime. + bonus_token_ids = torch.zeros(num_reqs, + device=self.device, + dtype=torch.int32) + self.rejection_sampler( + dummy_spec_decode_metadata, + draft_probs, + target_logits, + bonus_token_ids, + dummy_metadata, + ) + return sampler_output + + def _dummy_pooler_run_task( + self, + hidden_states: torch.Tensor, + task: PoolingTask, + ) -> PoolerOutput: + num_tokens = hidden_states.shape[0] + max_num_reqs = self.scheduler_config.max_num_seqs + num_reqs = min(num_tokens, max_num_reqs) + min_tokens_per_req = num_tokens // num_reqs + num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs + num_scheduled_tokens_list[-1] += num_tokens % num_reqs + assert sum(num_scheduled_tokens_list) == num_tokens + assert len(num_scheduled_tokens_list) == num_reqs + + req_num_tokens = num_tokens // num_reqs + + dummy_prompt_lens = torch.tensor( + num_scheduled_tokens_list, + device="cpu", + ) + dummy_token_ids = torch.zeros((num_reqs, req_num_tokens), + dtype=torch.int32, + device=self.device) + + model = cast(VllmModelForPooling, self.get_model()) + dummy_pooling_params = PoolingParams(task=task) + dummy_pooling_params.verify(task=task, model_config=self.model_config) + to_update = model.pooler.get_pooling_updates(task) + 
to_update.apply(dummy_pooling_params) + + dummy_metadata = PoolingMetadata( + prompt_lens=dummy_prompt_lens, + prompt_token_ids=dummy_token_ids, + pooling_params=[dummy_pooling_params] * num_reqs, + ) + + dummy_metadata.build_pooling_cursor(num_scheduled_tokens_list, + device=hidden_states.device) + + try: + return model.pooler(hidden_states=hidden_states, + pooling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "CUDA out of memory occurred when warming up pooler " + f"({task=}) with {num_reqs} dummy requests. Please try " + "lowering `max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e + + @torch.inference_mode() + def _dummy_pooler_run( + self, + hidden_states: torch.Tensor, + ) -> PoolerOutput: + # Find the task that has the largest output for subsequent steps + output_size = dict[PoolingTask, float]() + for task in self.get_supported_pooling_tasks(): + # Run a full batch with each task to ensure none of them OOMs + output = self._dummy_pooler_run_task(hidden_states, task) + output_size[task] = sum(o.nbytes for o in output) + del output # Allow GC + + max_task = max(output_size.items(), key=lambda x: x[1])[0] + return self._dummy_pooler_run_task(hidden_states, max_task) + + def profile_run(self) -> None: + # Profile with multimodal encoder & encoder cache. + if self.supports_mm_inputs: + if self.model_config.multimodal_config.skip_mm_profiling: + logger.info( + "Skipping memory profiling for multimodal encoder and " + "encoder cache.") + else: + mm_budget = self.mm_budget + assert mm_budget is not None + + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. 
+ dummy_modality = mm_budget.get_modality_with_max_tokens() + max_mm_items_per_batch = mm_budget \ + .max_items_per_batch_by_modality[dummy_modality] + + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the " + "maximum feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) + + # Create dummy batch of multimodal inputs. + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) + + # Run multimodal encoder. + dummy_encoder_outputs = \ + self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) + + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) + + # NOTE: This happens when encoder cache needs to store + # the embeddings that encoder outputs are scattered onto. + # In this case we create dummy embeddings of size + # (encode_budget, hidden_size) and scatter encoder + # output into it. + encoder_output_shape = dummy_encoder_outputs[0].shape + if encoder_output_shape[0] < encoder_budget: + expanded_outputs = [] + for output in dummy_encoder_outputs: + expanded = output.new_zeros( + (encoder_budget, encoder_output_shape[-1])) + num_tokens = output.shape[0] + expanded[:num_tokens].copy_(output) + expanded_outputs.append(expanded) + + dummy_encoder_outputs = expanded_outputs + + # Cache the dummy encoder outputs. 
+ self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) + + # Add `is_profile` here to pre-allocate communication buffers + hidden_states, last_hidden_states \ + = self._dummy_run(self.max_num_tokens, is_profile=True) + if get_pp_group().is_last_rank: + if self.is_pooling_model: + output = self._dummy_pooler_run(hidden_states) + else: + output = self._dummy_sampler_run(last_hidden_states) + else: + output = None + self._sync_device() + del hidden_states, output + self.encoder_cache.clear() + gc.collect() + + def capture_model(self) -> int: + if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: + logger.warning( + "Skipping CUDA graph capture. To turn on CUDA graph capture, " + "ensure `cudagraph_mode` was not manually set to `NONE`") + return 0 + else: + self.initialize_cudagraph_capture() + + compilation_counter.num_gpu_runner_capture_triggers += 1 + + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + @contextmanager + def freeze_gc(): + # Optimize garbage collection during CUDA graph capture. + # Clean up, then freeze all remaining objects from being included + # in future collections. + gc.collect() + should_freeze = not envs.VLLM_ENABLE_CUDAGRAPH_GC + if should_freeze: + gc.freeze() + try: + yield + finally: + if should_freeze: + gc.unfreeze() + gc.collect() + + # Trigger CUDA graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. 
+ set_cudagraph_capturing_enabled(True) + with freeze_gc(), graph_capture(device=self.device): + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None + if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE: + cudagraph_runtime_mode = cudagraph_mode.mixed_mode() + + compilation_cases = list(reversed(self.cudagraph_batch_sizes)) + self._capture_cudagraphs( + compilation_cases, + cudagraph_runtime_mode=cudagraph_runtime_mode, + uniform_decode=False) + + # Capture full cudagraph for uniform decode batches if we + # don't already have full mixed prefill-decode cudagraphs. + if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \ + cudagraph_mode.separate_routine(): + max_num_tokens = self.scheduler_config.max_num_seqs * \ + self.uniform_decode_query_len + decode_cudagraph_batch_sizes = [ + x for x in self.cudagraph_batch_sizes if + x <= max_num_tokens and x >= self.uniform_decode_query_len + ] + compilation_cases_decode = list( + reversed(decode_cudagraph_batch_sizes)) + self._capture_cudagraphs( + compilation_cases=compilation_cases_decode, + cudagraph_runtime_mode=CUDAGraphMode.FULL, + uniform_decode=True) + + # Disable cudagraph capturing globally, so any unexpected cudagraph + # capturing will be detected and raise an error after here. + # Note: We don't put it into graph_capture context manager because + # we may do lazy capturing in future that still allows capturing + # after here. + set_cudagraph_capturing_enabled(False) + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes 5~20 seconds. 
+ logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / (1 << 30)) + return cuda_graph_size + + def _capture_cudagraphs(self, compilation_cases: list[int], + cudagraph_runtime_mode: CUDAGraphMode, + uniform_decode: bool): + assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \ + cudagraph_runtime_mode in [CUDAGraphMode.FULL, + CUDAGraphMode.PIECEWISE] + + # Only rank 0 should print progress bar during capture + if is_global_first_rank(): + compilation_cases = tqdm( + compilation_cases, + disable=not self.load_config.use_tqdm_on_load, + desc="Capturing CUDA graphs ({}, {})".format( + "decode" if uniform_decode else "mixed prefill-decode", + cudagraph_runtime_mode.name)) + + # We skip EPLB here since we don't want to record dummy metrics + for num_tokens in compilation_cases: + # We currently only capture ubatched graphs when its a FULL + # cudagraph, a uniform decode batch, and the number of tokens + # is above the threshold. Otherwise we just capture a non-ubatched + # version of the graph + allow_microbatching = self.parallel_config.enable_dbo \ + and cudagraph_runtime_mode == CUDAGraphMode.FULL \ + and uniform_decode \ + and check_ubatch_thresholds( + config=self.vllm_config.parallel_config, + num_tokens=num_tokens, + uniform_decode=uniform_decode, + ) + + for _ in range(self.compilation_config.cudagraph_num_of_warmups): + # Use CUDAGraphRuntimeStyle.NONE (default) for warmup. + # But be careful, warm up with `NONE`is orthogonal to + # if we want to warm up attention or not. This is + # different from the case where `FULL` implies capture + # attention while `PIECEWISE` implies no attention. 
+ force_attention = ( + cudagraph_runtime_mode == CUDAGraphMode.FULL) + self._dummy_run(num_tokens, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + force_attention=force_attention, + uniform_decode=uniform_decode, + allow_microbatching=allow_microbatching, + skip_eplb=True, + remove_lora=False) + self._dummy_run(num_tokens, + cudagraph_runtime_mode=cudagraph_runtime_mode, + uniform_decode=uniform_decode, + allow_microbatching=allow_microbatching, + skip_eplb=True, + remove_lora=False) + self.maybe_remove_all_loras(self.lora_config) + + def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the attention backends and attention metadata builders. + """ + assert len(self.attn_groups) == 0, \ + "Attention backends are already initialized" + + class AttentionGroupKey(NamedTuple): + attn_backend: type[AttentionBackend] + kv_cache_spec: KVCacheSpec + + def get_attn_backends_for_group( + kv_cache_group_spec: KVCacheGroupSpec, + ) -> dict[AttentionGroupKey, list[str]]: + layers = get_layers_from_vllm_config( + self.vllm_config, AttentionLayerBase, + kv_cache_group_spec.layer_names) + attn_backends = {} + attn_backend_layers = defaultdict(list) + # Dedupe based on full class name; this is a bit safer than + # using the class itself as the key because when we create dynamic + # attention backend subclasses (e.g. ChunkedLocalAttention) unless + # they are cached correctly, there will be different objects per + # layer. 
+ for layer_name in kv_cache_group_spec.layer_names: + attn_backend = layers[layer_name].get_attn_backend() + + if layer_name in self.kv_sharing_fast_prefill_eligible_layers: + attn_backend = create_fast_prefill_custom_backend( + "FastPrefill", + attn_backend, + ) + + full_cls_name = attn_backend.full_cls_name() + layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs): + layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[ + layer_name] + key = (full_cls_name, layer_kv_cache_spec) + attn_backends[key] = AttentionGroupKey(attn_backend, + layer_kv_cache_spec) + attn_backend_layers[key].append(layer_name) + return { + attn_backends[k]: v + for k, v in attn_backend_layers.items() + } + + def create_attn_groups( + attn_backends_map: dict[AttentionGroupKey, list[str]], + ) -> list[AttentionGroup]: + attn_groups: list[AttentionGroup] = [] + for (attn_backend, + kv_cache_spec), layer_names in attn_backends_map.items(): + attn_group = AttentionGroup.create_with_metadata_builders( + attn_backend, + layer_names, + kv_cache_spec, + self.vllm_config, + self.device, + num_metadata_builders=1 + if not self.parallel_config.enable_dbo else 2, + ) + + attn_groups.append(attn_group) + return attn_groups + + for kv_cache_group_spec in kv_cache_config.kv_cache_groups: + attn_backends = get_attn_backends_for_group(kv_cache_group_spec) + self.attn_groups.append(create_attn_groups(attn_backends)) + + # Calculate reorder batch threshold (if needed) + self.calculate_reorder_batch_threshold() + + def initialize_cudagraph_capture(self) -> None: + min_cg_support = AttentionCGSupport.ALWAYS + min_cg_builder_name = None + + for attn_group in self._attn_group_iterator(): + builder = attn_group.get_metadata_builder() + if builder.cudagraph_support.value < min_cg_support.value: + min_cg_support = builder.cudagraph_support + min_cg_builder_name = builder.__class__.__name__ + # Flexible resolve the cudagraph mode + cudagraph_mode = 
self.compilation_config.cudagraph_mode + # check cudagraph for mixed batch is supported + if cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL \ + and min_cg_support != AttentionCGSupport.ALWAYS: + msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported " + f"with {min_cg_builder_name} backend (support: " + f"{min_cg_support})") + if min_cg_support == AttentionCGSupport.NEVER: + # if not supported any full cudagraphs, just raise it. + msg += "; please try cudagraph_mode=PIECEWISE, and "\ + "make sure compilation level is piecewise" + raise ValueError(msg) + + # attempt to resolve the full cudagraph related mode + if self.compilation_config.splitting_ops_contain_attention(): + msg += "; setting cudagraph_mode=FULL_AND_PIECEWISE" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.FULL_AND_PIECEWISE + else: + msg += "; setting cudagraph_mode=FULL_DECODE_ONLY" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.FULL_DECODE_ONLY + logger.warning(msg) + + # check that if we are doing decode full-cudagraphs it is supported + if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL + and min_cg_support == AttentionCGSupport.NEVER): + msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported " + f"with {min_cg_builder_name} backend (support: " + f"{min_cg_support})") + if (self.compilation_config.level == CompilationLevel.PIECEWISE and + (self.compilation_config.splitting_ops_contain_attention() + or self.compilation_config.use_inductor_graph_partition)): + msg += "; setting cudagraph_mode=PIECEWISE because "\ + "attention is compiled piecewise" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + msg += "; setting cudagraph_mode=NONE because "\ + "attention is not compiled piecewise" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.NONE + logger.warning(msg) + + # check that if we are doing spec-decode + decode full-cudagraphs it is + # 
supported + if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL + and self.uniform_decode_query_len > 1 and min_cg_support.value + < AttentionCGSupport.UNIFORM_BATCH.value): + msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported" + f" with spec-decode for attention backend " + f"{min_cg_builder_name} (support: {min_cg_support})") + if self.compilation_config.splitting_ops_contain_attention(): + msg += "; setting cudagraph_mode=PIECEWISE" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + msg += "; setting cudagraph_mode=NONE" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.NONE + logger.warning(msg) + + # double check that we can support full cudagraph if they are requested + # even after automatic downgrades + if cudagraph_mode.has_full_cudagraphs() \ + and min_cg_support == AttentionCGSupport.NEVER: + raise ValueError(f"CUDAGraphMode.{cudagraph_mode.name} is not " + f"supported with {min_cg_builder_name} backend (" + f"support:{min_cg_support}) " + "; please try cudagraph_mode=PIECEWISE, " + "and make sure compilation level is piecewise") + + # Trigger cudagraph dispatching keys initialization here (after + # initializing attn backends). 
+ self.cudagraph_dispatcher.initialize_cudagraph_keys( + self.compilation_config.cudagraph_mode, + self.uniform_decode_query_len) + + def calculate_reorder_batch_threshold(self) -> None: + """ + Check that if any backends reorder batches; that the reordering + is compatible (e.g., decode threshold is the same) + """ + for group in self._attn_group_iterator(): + attn_metadata_builder_i = group.get_metadata_builder() + + # check that if any backends reorder batches; that the reordering + # is compatible (e.g., decode threshold is the same) + reorder_batch_threshold_i = ( + attn_metadata_builder_i.reorder_batch_threshold) + if reorder_batch_threshold_i is not None: + if self.reorder_batch_threshold is not None: + if reorder_batch_threshold_i != \ + self.reorder_batch_threshold: + raise ValueError( + f"Attention backend reorders decodes with " + f"threshold {reorder_batch_threshold_i} but other " + f"backend uses threshold " + f"{self.reorder_batch_threshold}") + else: + self.reorder_batch_threshold = reorder_batch_threshold_i + + def may_reinitialize_input_batch(self, + kv_cache_config: KVCacheConfig) -> None: + """ + Re-initialize the input batch if the block sizes are different from + `[self.cache_config.block_size]`. This usually happens when there + are multiple KV cache groups. + + Args: + kv_cache_config: The KV cache configuration. + """ + block_sizes = [ + kv_cache_group.kv_cache_spec.block_size + for kv_cache_group in kv_cache_config.kv_cache_groups + ] + if block_sizes != [self.cache_config.block_size]: + assert self.cache_config.cpu_offload_gb == 0, ( + "Cannot re-initialize the input batch when CPU weight " + "offloading is enabled. 
See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501 + "for more details.") + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=max(self.max_model_len, self.max_encoder_len), + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=block_sizes, + is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=self.input_batch.logitsprocs, + is_pooling_model=self.is_pooling_model, + num_speculative_tokens=( + self.vllm_config.speculative_config.num_speculative_tokens + if self.vllm_config.speculative_config else 0), + ) + + def _allocate_kv_cache_tensors( + self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]: + """ + Initializes the KV cache buffer with the correct size. The buffer needs + to be reshaped to the desired shape before being used by the models. + + Args: + kv_cache_config: The KV cache config + Returns: + dict[str, torch.Tensor]: A map between layer names to their + corresponding memory buffer for KV cache. 
+ """ + kv_cache_raw_tensors: dict[str, torch.Tensor] = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + tensor = torch.zeros(kv_cache_tensor.size, + dtype=torch.int8, + device=self.device) + for layer_name in kv_cache_tensor.shared_by: + kv_cache_raw_tensors[layer_name] = tensor + + layer_names = set() + for group in kv_cache_config.kv_cache_groups: + for layer_name in group.layer_names: + if layer_name in self.runner_only_attn_layers: + continue + layer_names.add(layer_name) + assert layer_names == set(kv_cache_raw_tensors.keys( + )), "Some layers are not correctly initialized" + return kv_cache_raw_tensors + + def _attn_group_iterator(self) -> Iterator[AttentionGroup]: + return itertools.chain.from_iterable(self.attn_groups) + + def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]: + if not self.kv_cache_config.kv_cache_groups: + return + for attn_groups in self.attn_groups: + yield from attn_groups + + def _reshape_kv_cache_tensors( + self, + kv_cache_config: KVCacheConfig, + kv_cache_raw_tensors: dict[str, torch.Tensor], + ) -> dict[str, torch.Tensor]: + """ + Reshape the KV cache tensors to the desired shape and dtype. + + Args: + kv_cache_config: The KV cache config + kv_cache_raw_tensors: The KV cache buffer of each layer, with + correct size but uninitialized shape. + Returns: + Dict[str, torch.Tensor]: A map between layer names to their + corresponding memory buffer for KV cache. 
+ """ + kv_caches: dict[str, torch.Tensor] = {} + has_attn, has_mamba = False, False + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec + attn_backend = group.backend + for layer_name in group.layer_names: + if layer_name in self.runner_only_attn_layers: + continue + raw_tensor = kv_cache_raw_tensors[layer_name] + assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0 + num_blocks = (raw_tensor.numel() // + kv_cache_spec.page_size_bytes) + if isinstance(kv_cache_spec, AttentionSpec): + has_attn = True + kv_cache_shape = attn_backend.get_kv_cache_shape( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=self.cache_config.cache_dtype) + dtype = kv_cache_spec.dtype + try: + kv_cache_stride_order = \ + attn_backend.get_kv_cache_stride_order() + assert len(kv_cache_stride_order) == len( + kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple( + range(len(kv_cache_shape))) + # The allocation respects the backend-defined stride order + # to ensure the semantic remains consistent for each + # backend. We first obtain the generic kv cache shape and + # then permute it according to the stride order which could + # result in a non-contiguous tensor. + kv_cache_shape = tuple(kv_cache_shape[i] + for i in kv_cache_stride_order) + # Maintain original KV shape view. 
+ inv_order = [ + kv_cache_stride_order.index(i) + for i in range(len(kv_cache_stride_order)) + ] + kv_caches[layer_name] = kv_cache_raw_tensors[ + layer_name].view(dtype).view(kv_cache_shape).permute( + *inv_order) + elif isinstance(kv_cache_spec, MambaSpec): + has_mamba = True + raw_tensor = kv_cache_raw_tensors[layer_name] + state_tensors = [] + storage_offset_bytes = 0 + for (shape, dtype) in zip(kv_cache_spec.shapes, + kv_cache_spec.dtypes): + dtype_size = get_dtype_size(dtype) + num_element_per_page = ( + kv_cache_spec.page_size_bytes // dtype_size) + target_shape = (num_blocks, *shape) + stride = torch.empty(target_shape).stride() + target_stride = (num_element_per_page, *stride[1:]) + assert storage_offset_bytes % dtype_size == 0 + tensor = torch.as_strided( + raw_tensor.view(dtype), + size=target_shape, + stride=target_stride, + storage_offset=storage_offset_bytes // dtype_size, + ) + state_tensors.append(tensor) + storage_offset_bytes += stride[0] * dtype_size + + kv_caches[layer_name] = state_tensors + else: + raise NotImplementedError + + if has_attn and has_mamba: + self._update_hybrid_attention_mamba_layout(kv_caches) + + return kv_caches + + def _update_hybrid_attention_mamba_layout( + self, kv_caches: dict[str, torch.Tensor]) -> None: + """ + Update the layout of attention layers from (2, num_blocks, ...) to + (num_blocks, 2, ...). + + Args: + kv_caches: The KV cache buffer of each layer. + """ + + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec + for layer_name in group.layer_names: + kv_cache = kv_caches[layer_name] + if (isinstance(kv_cache_spec, AttentionSpec) + and kv_cache.shape[0] == 2): + assert kv_cache.shape[1] != 2, \ + "Fail to determine whether the layout is " \ + "(2, num_blocks, ...) or (num_blocks, 2, ...) 
for " \ + f"a tensor of shape {kv_cache.shape}" + hidden_size = kv_cache.shape[2:].numel() + kv_cache.as_strided_(size=kv_cache.shape, + stride=(hidden_size, 2 * hidden_size, + *kv_cache.stride()[2:])) + + def initialize_kv_cache_tensors( + self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]: + """ + Initialize the memory buffer for KV cache. + + Args: + kv_cache_config: The KV cache config + Returns: + Dict[str, torch.Tensor]: A map between layer names to their + corresponding memory buffer for KV cache. + """ + # Initialize the memory buffer for KV cache + kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) + # Change the memory buffer to the desired shape + kv_caches = self._reshape_kv_cache_tensors(kv_cache_config, + kv_cache_raw_tensors) + + # Set up cross-layer KV cache sharing + for layer_name, target_layer_name in self.shared_kv_cache_layers.items( + ): + logger.debug("%s reuses KV cache of %s", layer_name, + target_layer_name) + kv_caches[layer_name] = kv_caches[target_layer_name] + + num_attn_module = 2 \ + if self.model_config.hf_config.model_type == "longcat_flash" else 1 + bind_kv_cache(kv_caches, + self.compilation_config.static_forward_context, + self.kv_caches, num_attn_module) + return kv_caches + + def maybe_add_kv_sharing_layers_to_kv_cache_groups( + self, kv_cache_config: KVCacheConfig) -> None: + """ + Add layers that re-use KV cache to KV cache group of its target layer. 
+ Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()` + """ + if not self.shared_kv_cache_layers: + # No cross-layer KV sharing, return + return + + add_kv_sharing_layers_to_kv_cache_groups( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + self.runner_only_attn_layers, + ) + + if self.cache_config.kv_sharing_fast_prefill: + # In You Only Cache Once (https://arxiv.org/abs/2405.05254) or other + # similar KV sharing setups, only the layers that generate KV caches + # are involved in the prefill phase, enabling prefill to early exit. + attn_layers = get_layers_from_vllm_config(self.vllm_config, + Attention) + for layer_name in reversed(attn_layers): + if layer_name in self.shared_kv_cache_layers: + self.kv_sharing_fast_prefill_eligible_layers.add( + layer_name) + else: + break + + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize KV cache based on `kv_cache_config`. + Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + kv_cache_config = deepcopy(kv_cache_config) + self.kv_cache_config = kv_cache_config + self.may_reinitialize_input_batch(kv_cache_config) + self.may_add_encoder_only_layers_to_kv_cache_config() + self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config) + self.initialize_attn_backend(kv_cache_config) + kv_caches = self.initialize_kv_cache_tensors(kv_cache_config) + + if self.speculative_config and self.speculative_config.use_eagle(): + assert isinstance(self.drafter, EagleProposer) + # validate all draft model layers belong to the same kv cache + # group + self.drafter.validate_same_kv_cache_group(kv_cache_config) + + if has_kv_transfer_group(): + get_kv_transfer_group().register_kv_caches(kv_caches) + if self.device.type == 'xpu': + get_kv_transfer_group().set_host_xfer_buffer_ops( + copy_kv_blocks) + + if self.dcp_world_size > 1: + layer_names = self.attn_groups[0][0].layer_names + layers = 
get_layers_from_vllm_config(self.vllm_config, + AttentionLayerBase, + layer_names) + for layer in layers.values(): + assert layer.impl.need_to_return_lse_for_decode, ( + "DCP requires attention impls to return" + " the softmax lse for decode, but the impl " + f"{layer.impl.__class__.__name__} " + "does not return the softmax lse for decode.") + + def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: + """ + Add encoder-only layers to the KV cache config. + """ + block_size = self.vllm_config.cache_config.block_size + encoder_only_attn_specs: dict[AttentionSpec, + list[str]] = defaultdict(list) + attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) + for layer_name, attn_module in attn_layers.items(): + if attn_module.attn_type == AttentionType.ENCODER_ONLY: + attn_spec: AttentionSpec = EncoderOnlyAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype) + encoder_only_attn_specs[attn_spec].append(layer_name) + self.runner_only_attn_layers.add(layer_name) + if len(encoder_only_attn_specs) > 0: + assert len( + encoder_only_attn_specs + ) == 1, "Only support one encoder-only attention spec now" + spec, layer_names = encoder_only_attn_specs.popitem() + self.kv_cache_config.kv_cache_groups.append( + KVCacheGroupSpec(layer_names=layer_names, kv_cache_spec=spec)) + + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: + """ + Generates the KVCacheSpec by parsing the kv cache format from each + Attention module in the static forward context. + Returns: + KVCacheSpec: A dictionary mapping layer names to their KV cache + format. Layers that do not need KV cache are not included. 
+ """ + + block_size = self.vllm_config.cache_config.block_size + use_mla = self.vllm_config.model_config.use_mla + cache_dtype_str = self.vllm_config.cache_config.cache_dtype + kv_cache_spec: dict[str, KVCacheSpec] = {} + attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) + for layer_name, attn_module in attn_layers.items(): + if (kv_tgt_layer := + attn_module.kv_sharing_target_layer_name) is not None: + # The layer doesn't need its own KV cache and will use that of + # the target layer. We skip creating a KVCacheSpec for it, so + # that KV cache management logic will act as this layer does + # not exist, and doesn't allocate KV cache for the layer. This + # enables the memory saving of cross-layer kv sharing, allowing + # a given amount of memory to accommodate longer context lengths + # or enable more requests to be processed simultaneously. + self.shared_kv_cache_layers[layer_name] = kv_tgt_layer + continue + + # TODO(lucas): move the attention specs into the model layers like + # the attention backends + if attn_module.attn_type == AttentionType.DECODER: + if attn_module.sliding_window is not None: + assert not use_mla, "MLA is not supported for sliding" \ + "window" + kv_cache_spec[layer_name] = SlidingWindowSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + sliding_window=attn_module.sliding_window) + elif use_mla: + kv_cache_spec[layer_name] = MLAAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + cache_dtype_str=cache_dtype_str) + elif self.attention_chunk_size is not None \ + and isinstance(attn_module, ChunkedLocalAttention): + kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + 
attention_chunk_size=self.attention_chunk_size) + else: + kv_cache_spec[layer_name] = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype) + elif attn_module.attn_type == AttentionType.ENCODER_DECODER: + kv_cache_spec[layer_name] = CrossAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype) + elif attn_module.attn_type in (AttentionType.ENCODER, + AttentionType.ENCODER_ONLY): + # encoder-only attention does not need KV cache. + continue + else: + raise ValueError( + f"Unknown attention type: {attn_module.attn_type}") + + mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase) + if len(mamba_layers) > 0: + if (self.vllm_config.speculative_config is not None + and self.vllm_config.model_config.hf_config.model_type + not in ["qwen3_next"]): + raise NotImplementedError( + "Mamba with speculative decoding is not supported yet.") + if self.vllm_config.cache_config.enable_prefix_caching: + raise NotImplementedError( + "Prefix caching is not supported for Mamba yet.") + max_model_len = self.vllm_config.model_config.max_model_len + + page_size_padded = ( + self.vllm_config.cache_config.mamba_page_size_padded) + + # Set block_size to max_model_len, so that mamba model will always + # have only one block in the KV cache. 
+ for layer_name, mamba_module in mamba_layers.items(): + kv_cache_spec[layer_name] = MambaSpec( + shapes=mamba_module.get_state_shape(), + dtypes=mamba_module.get_state_dtype(), + block_size=max_model_len, + page_size_padded=page_size_padded, + mamba_type=mamba_module.mamba_type, + num_speculative_blocks=( + self.speculative_config.num_speculative_tokens + if self.speculative_config else 0), + ) + ds_indexer_layers = get_layers_from_vllm_config( + self.vllm_config, DeepseekV32IndexerCache) + for layer_name, ds_indexer_module in ds_indexer_layers.items(): + kv_cache_spec[layer_name] = ds_indexer_module.get_kv_cache_spec() + + return kv_cache_spec + + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + # This is a short term mitigation for issue mentioned in + # https://github.com/vllm-project/vllm/issues/22754. + # `tolist` would trigger a cuda wise stream sync, which + # would block other copy ops from other cuda streams. + # A cuda event sync would avoid such a situation. Since + # this is in the critical path of every single model + # forward loop, this has caused perf issue for a disagg + # setup. + pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]] + pinned.copy_(sampled_token_ids, non_blocking=True) + self.transfer_event.record() + self.transfer_event.synchronize() + return pinned.tolist() diff --git a/vllm_kunlun/vllm_utils_wrapper.py b/vllm_kunlun/vllm_utils_wrapper.py index 79b4eb8..2ed056d 100644 --- a/vllm_kunlun/vllm_utils_wrapper.py +++ b/vllm_kunlun/vllm_utils_wrapper.py @@ -1,19 +1,3 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """vllm_utils_wrapper.py""" import vllm.distributed.parallel_state as parallel_state @@ -61,7 +45,7 @@ vllm_lib = Library("vllm", "FRAGMENT") # noqa def direct_register_custom_op( op_name: str, op_func: Callable, - mutates_args: list[str], + mutates_args: Optional[list[str]] = None, fake_impl: Optional[Callable] = None, target_lib: Optional[Library] = None, dispatch_key: str = "CUDA", @@ -91,7 +75,8 @@ def direct_register_custom_op( "use vLLM in a fresh new environment and let it install " "the required dependencies.") return - + if mutates_args is None: + mutates_args = [] import torch.library if hasattr(torch.library, "infer_schema"): patched_func = patch_annotations_for_schema(op_func) @@ -114,7 +99,10 @@ def vllm_kunlun_weak_ref_tensor(tensor: Any) -> Any: but will not keep the original tensor alive. 
""" # return tensor - return torch.ops._kunlun.weak_ref_tensor(tensor) + if isinstance(tensor, torch.Tensor): + return torch.ops._kunlun.weak_ref_tensor(tensor) + else: + return tensor def vllm_kunlun_weak_ref_tensors( tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]] @@ -192,12 +180,11 @@ parallel_state.GroupCoordinator.all_gather = vllm_kunlun_all_gather from torch.library import custom_op, impl +import torch from vllm import _custom_ops as ops from typing import Optional, List +import os -################################################## -# --------------- Not yet supported -------------- -################################################## @custom_op("_C::rms_norm", mutates_args=()) def rms_norm( result : torch.Tensor, @@ -336,13 +323,52 @@ def rms_norm_dynamic_per_token_quant_xpu( )->None: pass +@custom_op("_C::silu_and_mul", mutates_args=()) +def silu_and_mul( + result : torch.Tensor, + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + epsilon: float +)->None: + pass +@impl("_C::silu_and_mul", "CUDA") +def silu_and_mul_xpu( + result : torch.Tensor, + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + epsilon: float +)->None: + pass +@custom_op("_C::silu_and_mul_quant", mutates_args=()) +def silu_and_mul_quant( + result : torch.Tensor, + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + epsilon: float +)->None: + pass +@impl("_C::silu_and_mul_quant", "CUDA") +def silu_and_mul_quant_xpu( + result : torch.Tensor, + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + epsilon: float +)->None: + pass + +import torch import xtorch_ops from torch.library import custom_op, impl -################################################## -# --------------- Norm -------------- -################################################## @custom_op("_C::add_rmsnorm", mutates_args=()) def 
add_rmsnorm( x: torch.Tensor, @@ -359,13 +385,14 @@ def add_rmsnorm( ) -> None: xtorch_ops.add_rmsnorm( x, - y, + y, # 原来写 residual,这里其实是 y residual_output=residual_output, weight=weight, eps=eps, output=output, ) + @impl("_C::add_rmsnorm", "CUDA") def add_rmsnorm_cuda( x: torch.Tensor, @@ -389,6 +416,7 @@ def add_rmsnorm_cuda( output=output, ) + @custom_op("_C::rmsnorm", mutates_args=()) def rmsnorm( x: torch.Tensor, @@ -426,10 +454,13 @@ def rmsnorm_cuda( output, eps, ) + +import torch + def _fake_rmsnorm(x, weight, output, eps=1e-5, interweave=False, store_output_before_norm=True, bias=None, residual_output=None, output_max=None): - # Set shape/dtype, but do not return value + # 设置 shape/dtype,但不返回值 output.fake_shape = x.shape output.fake_dtype = x.dtype return None @@ -446,546 +477,6 @@ def _fake_add_rmsnorm(x, y, weight, output, eps=1e-5, add_rmsnorm.register_fake(_fake_add_rmsnorm) - -################################################## -# --------------- Fake FC -------------- -################################################## -# register fake op impl here -# for torch.dynamo -from torch.library import register_fake -if hasattr(torch.ops.custom_ops, "fc_fusion"): - @register_fake("custom_ops::fc_fusion") - def fc_fusion_fake(self: torch.Tensor, - other: torch.Tensor, - bias: Optional[torch.Tensor], - self_trans: bool, - other_trans: bool, - *, - alpha: float=1.0, - beta: float=0.0, - act: int=1, - multi_stream: bool=False, - out: torch.Tensor - ) -> None: - pass - - -################################################## -# --------------- Activation-------------- -################################################## -@custom_op("_C::swiglu", mutates_args=()) -def swiglu( - x: torch.Tensor, - y: torch.Tensor, - axis: int=-1, - turn: bool=True -) -> None: - xtorch_ops.swiglu( - x, - y, - ) - -@impl("_C::swiglu", "CUDA") -def swiglu_cuda( - x: torch.Tensor, - y: torch.Tensor, - axis: int=-1, - turn: bool=True -) -> None: - xtorch_ops.swiglu( - x, - y, - ) - -def 
_fake_swiglu( - x: torch.Tensor, - y: torch.Tensor, - axis: int=-1, - turn: bool=True): - return None - -swiglu.register_fake(_fake_swiglu) - -@custom_op("_C::swigluoai_and_mul", mutates_args=()) -def swigluoai_and_mul( - x: torch.Tensor, - alpha: float = 1.702, - limit: float = 7.0, - axis: int = -1, - turn: bool = True -) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - gate, up = x[..., ::2], x[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - -@impl("_C::swigluoai_and_mul", "CUDA") -def swigluoai_and_mul_cuda( - x: torch.Tensor, - alpha: float = 1.702, - limit: float = 7.0, - axis: int = -1, - turn: bool = True -) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - gate, up = x[..., ::2], x[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - -def _fake_swigluoai_and_mul( - x: torch.Tensor, - alpha: float = 1.702, - limit: float = 7.0, - axis: int = -1, - turn: bool = True -) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - gate, up = x[..., ::2], x[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - -swigluoai_and_mul.register_fake(_fake_swigluoai_and_mul) - - -################################################## -# --------------- Moe ----------------- -################################################## -@custom_op("_C::moe_softmax_topk", mutates_args=()) -def moe_softmax_topk( - x: torch.Tensor, - normed_score: torch.Tensor, - topk_index: torch.Tensor, - block_statistic: torch.Tensor, - axis: int = -1, - turn: bool = True -) -> None: - 
xtorch_ops.moe_softmax_topk( - x, - normed_score, - topk_index, - block_statistic - ) - -@impl("_C::moe_softmax_topk", "CUDA") -def moe_softmax_topk_cuda( - x: torch.Tensor, - normed_score: torch.Tensor, - topk_index: torch.Tensor, - block_statistic: torch.Tensor, - axis: int = -1, - turn: bool = True -) -> None: - xtorch_ops.moe_softmax_topk( - x, - normed_score, - topk_index, - block_statistic - ) - -def _fake_moe_softmax_topk( - x: torch.Tensor, - normed_score: torch.Tensor, - topk_index: torch.Tensor, - block_statistic: torch.Tensor, - axis: int = -1, - turn: bool = True -) -> None: - return None - -moe_softmax_topk.register_fake(_fake_moe_softmax_topk) - -@custom_op("_C::moe_ffn_block", mutates_args=()) -def moe_ffn_block( - out: torch.Tensor, - x: torch.Tensor, - expert_num: int, - moe_top_k: int, - gate_w: torch.Tensor, - inter_w: torch.Tensor, - output_w: torch.Tensor, - renormalize: bool = True, - use_grouped_topk: bool = False, - expert_group_num: Optional[int] = 0, - topk_group: Optional[int] = 0, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None, -) -> None: - xtorch_ops.moe_ffn_block( - x=x, - gate_w=gate_w, - inter_w=inter_w, - output_w=output_w, - expert_num=expert_num, - moe_top_k=moe_top_k, - topk_group=topk_group, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - expert_group_num=expert_group_num, - out=out, - ) - - -@impl("_C::moe_ffn_block", "CUDA") -def moe_ffn_block_cuda( - out: torch.Tensor, - x: torch.Tensor, - expert_num: int, - moe_top_k: int, - gate_w: torch.Tensor, - inter_w: torch.Tensor, - output_w: torch.Tensor, - renormalize: bool = True, - use_grouped_topk: bool = False, - expert_group_num: Optional[int] = 0, - topk_group: Optional[int] = 0, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None, -) -> None: - xtorch_ops.moe_ffn_block( - x=x, - gate_w=gate_w, - inter_w=inter_w, - output_w=output_w, - expert_num=expert_num, - moe_top_k=moe_top_k, - 
topk_group=topk_group, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - expert_group_num=expert_group_num, - out=out, - ) - -def _fake_moe_ffn_block( - out: torch.Tensor, - x: torch.Tensor, - expert_num: int, - moe_top_k: int, - gate_w: torch.Tensor, - inter_w: torch.Tensor, - output_w: torch.Tensor, - renormalize: bool = True, - use_grouped_topk: bool = False, - expert_group_num: Optional[int] = 0, - topk_group: Optional[int] = 0,): - return None - -moe_ffn_block.register_fake(_fake_moe_ffn_block) - -@custom_op("_C::moe_ffn_per_token_block", mutates_args=()) -def moe_ffn_per_token_block( - x: torch.Tensor, - inter_weight: torch.Tensor, - inter_scale: torch.Tensor, - outer_weight: torch.Tensor, - outer_scale: torch.Tensor, - top_k: int, - global_num_experts: int, - linear_weights: Optional[torch.Tensor] = None, - expert_map: Optional[torch.Tensor] = None, - activation: str = "silu", - output: Optional[torch.Tensor] = None, - use_expert_parallel: bool = False, - ep_size: int = 1, - ep_rank: int = 0 -) -> None: - xtorch_ops.moe_ffn_per_token_block( - x=x, - inter_weight=inter_weight, - inter_scale=inter_scale, - outer_weight=outer_weight, - outer_scale=outer_scale, - gate_weight=linear_weights, - expert_num=global_num_experts, - moe_top_k=top_k, - act_type=activation, - use_expert_parallel=use_expert_parallel, - ep_size=ep_size, - ep_rank=ep_rank, - out=output, - ) - -@impl("_C::moe_ffn_per_token_block", "CUDA") -def moe_ffn_per_token_block_cuda( - x: torch.Tensor, - inter_weight: torch.Tensor, - inter_scale: torch.Tensor, - outer_weight: torch.Tensor, - outer_scale: torch.Tensor, - top_k: int, - global_num_experts: int, - linear_weights: Optional[torch.Tensor] = None, - expert_map: Optional[torch.Tensor] = None, - activation: str = "silu", - output: Optional[torch.Tensor] = None, - use_expert_parallel: bool = False, - ep_size: int = 1, - ep_rank: int = 0 -) -> None: - xtorch_ops.moe_ffn_per_token_block( - x=x, - inter_weight=inter_weight, - 
inter_scale=inter_scale, - outer_weight=outer_weight, - outer_scale=outer_scale, - gate_weight=linear_weights, - expert_num=global_num_experts, - moe_top_k=top_k, - act_type=activation, - use_expert_parallel=use_expert_parallel, - ep_size=ep_size, - ep_rank=ep_rank, - out=output, - ) - -def _fake_moe_ffn_per_token_block( - x: torch.Tensor, - inter_weight: torch.Tensor, - inter_scale: torch.Tensor, - outer_weight: torch.Tensor, - outer_scale: torch.Tensor, - top_k: int, - global_num_experts: int, - linear_weights: Optional[torch.Tensor] = None, - expert_map: Optional[torch.Tensor] = None, - activation: str = "silu", - output: Optional[torch.Tensor] = None, - use_expert_parallel: bool = False, - ep_size: int = 1, - ep_rank: int = 0 -) -> None: - # Fake implementation can be a no-op or a simple operation - if output is not None: - output.copy_(x) # Example: simply copy input to output - -# Register the fake implementation -moe_ffn_per_token_block.register_fake(_fake_moe_ffn_per_token_block) - -@custom_op("_C::gen_block_statistic", mutates_args=()) -def gen_block_statistic( - topk_ids: torch.Tensor, - block_statistic: torch.Tensor) -> None: - - xtorch_ops.gen_block_statistic(topk_ids, block_statistic) - - -@impl("_C::gen_block_statistic", "CUDA") -def gen_block_statistic_cuda( - topk_ids: torch.Tensor, - block_statistic: torch.Tensor) -> None: - xtorch_ops.gen_block_statistic(topk_ids, block_statistic) - -def _fake_gen_block_statistic( - topk_ids: torch.Tensor, - block_statistic: torch.Tensor) -> None: - return None - -gen_block_statistic.register_fake(_fake_gen_block_statistic) - - -@custom_op("_C::moe_pre_sorted", mutates_args=()) -def moe_pre_sorted( - x: torch.Tensor, - topk_index: torch.Tensor, - block_statistic: torch.Tensor, - moe_expand: torch.Tensor, - moe_index: torch.Tensor, - expert_m: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - index_have_neg: bool = False) -> None: - - xtorch_ops.moe_pre_sorted( - x, topk_index, block_statistic, moe_expand, 
moe_index, expert_m, sorted_tokens_num_lod, index_have_neg) - - -@impl("_C::moe_pre_sorted","CUDA") -def moe_pre_sorted_cuda( - x: torch.Tensor, - topk_index: torch.Tensor, - block_statistic: torch.Tensor, - moe_expand: torch.Tensor, - moe_index: torch.Tensor, - expert_m: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - index_have_neg: bool = False) -> None: - - xtorch_ops.moe_pre_sorted( - x, topk_index, block_statistic, moe_expand, moe_index, expert_m, sorted_tokens_num_lod, index_have_neg) - -def _fake_moe_pre_sorted( - x: torch.Tensor, - topk_index: torch.Tensor, - block_statistic: torch.Tensor, - moe_expand: torch.Tensor, - moe_index: torch.Tensor, - expert_m: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - index_have_neg: bool = False) -> None: - return None - -moe_pre_sorted.register_fake(_fake_moe_pre_sorted) - -@custom_op("_C::moe_post", mutates_args=()) -def moe_post( - x: torch.Tensor, - moe_index: torch.Tensor, - normed_scale: torch.Tensor, - dequant_scale: torch.Tensor, - y: torch.Tensor) -> None: - - xtorch_ops.moe_post(x, moe_index, normed_scale, dequant_scale, y) - - -@impl("_C::moe_post", "CUDA") -def moe_post_cuda( - x: torch.Tensor, - moe_index: torch.Tensor, - normed_scale: torch.Tensor, - dequant_scale: torch.Tensor, - y: torch.Tensor)-> None: - - xtorch_ops.moe_post(x, moe_index, normed_scale, dequant_scale, y) - -def _fake_moe_post( - x: torch.Tensor, - moe_index: torch.Tensor, - normed_scale: torch.Tensor, - dequant_scale: torch.Tensor, - y: torch.Tensor)-> None: - return None - -moe_post.register_fake(_fake_moe_post) - -@custom_op("_C::moe_fc", mutates_args=()) -def moe_fc( - x: torch.Tensor, - weight: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - sorted_tokens_idx: torch.Tensor, - moe_topk: int, - y: torch.Tensor, - act: torch.Tensor, - x_perchannel_max: torch.Tensor, - w_perchannel_max: torch.Tensor, - topk_ids: torch.Tensor, - topk_w: torch.Tensor, - bias: torch.Tensor, - tgemm_type: torch.Tensor, - tweight_type: 
torch.Tensor, - scale_n: int, - scale_k: int, - use_pack_int4: bool) -> None: - - xtorch_ops.moe_fc( - x, weight, sorted_tokens_num_lod, sorted_tokens_idx, moe_topk, y, act, - x_perchannel_max, w_perchannel_max, topk_ids, topk_w, - bias, tgemm_type, tweight_type, scale_n, scale_k, use_pack_int4) - -@impl("_C::moe_fc", "CUDA") -def moe_fc_cuda( - x: torch.Tensor, - weight: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - sorted_tokens_idx: torch.Tensor, - moe_topk: int, - y: torch.Tensor, - act: torch.Tensor, - x_perchannel_max: torch.Tensor, - w_perchannel_max: torch.Tensor, - topk_ids: torch.Tensor, - topk_w: torch.Tensor, - bias: torch.Tensor, - tgemm_type: torch.Tensor, - tweight_type: torch.Tensor, - scale_n: int, - scale_k: int, - use_pack_int4: bool) -> None: - - xtorch_ops.moe_fc( - x, weight, sorted_tokens_num_lod, sorted_tokens_idx, moe_topk, y, act, - x_perchannel_max, w_perchannel_max, topk_ids, topk_w, - bias, tgemm_type, tweight_type, scale_n, scale_k, use_pack_int4) - -def _fake_moe_fc( - x: torch.Tensor, - weight: torch.Tensor, - sorted_tokens_num_lod: torch.Tensor, - sorted_tokens_idx: torch.Tensor, - moe_topk: int, - y: torch.Tensor, - act: torch.Tensor, - x_perchannel_max: torch.Tensor, - w_perchannel_max: torch.Tensor, - topk_ids: torch.Tensor, - topk_w: torch.Tensor, - bias: torch.Tensor, - tgemm_type: torch.Tensor, - tweight_type: torch.Tensor, - scale_n: int, - scale_k: int, - use_pack_int4: bool) -> None: - return None - - -################################################## -# --------------- rotary_embedding ----------------- -################################################## -@custom_op("_C::rotary_embedding", mutates_args=()) -def rotary_embedding( - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - head_size: int, - cos_sin_cache: torch.Tensor, - is_neox: bool, -) -> None : - xtorch_ops.rotary_embedding( - positions=positions, - query=query, - key=key, - head_size=head_size, - cos_sin_cache=cos_sin_cache, - 
is_neox=is_neox) - -@impl("_C::rotary_embedding", "CUDA") -def rotary_embedding_cuda( - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - head_size: int, - cos_sin_cache: torch.Tensor, - is_neox: bool, -) -> None: - xtorch_ops.rotary_embedding( - positions=positions, - query=query, - key=key, - head_size=head_size, - cos_sin_cache=cos_sin_cache, - is_neox=is_neox) - - -def _fake_rotary_embedding( - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - head_size: int, - cos_sin_cache: torch.Tensor, - is_neox: bool, -)-> None: - return None - - -rotary_embedding.register_fake(_fake_rotary_embedding) - @custom_op("_C::split_norm_rope_neox", mutates_args=()) def split_norm_rope_neox( q_emb: torch.Tensor, @@ -1082,173 +573,579 @@ def _fake_split_norm_rope_neox( split_norm_rope_neox.register_fake(_fake_split_norm_rope_neox) -################################################## -# --------------- awq_dequantize ----------------- -################################################## -@custom_op("_C::awq_dequantize", mutates_args=()) -def awq_dequantize( - qweight: torch.Tensor, - scales: torch.Tensor, - zeros: torch.Tensor, - quant_type: int = 0, - align_type: int = 1, -) -> torch.Tensor: - weight = torch.empty( - (qweight.shape[0], qweight.shape[1] * 8), - dtype=torch.float16, - device=qweight.device, - ) - group_m = int(qweight.shape[0] / scales.shape[0]) - xtorch_ops.awq_dequantize( - qweight=qweight, - scales=scales, - zeros=zeros, - weight=weight, - group_m=group_m, - quant_type=quant_type, - align_type=align_type, - ) - return weight +# register fake op impl here +# for torch.dynamo +from torch.library import register_fake +if hasattr(torch.ops.custom_ops, "fc_fusion"): + @register_fake("custom_ops::fc_fusion") + def fc_fusion_fake(self: torch.Tensor, + other: torch.Tensor, + bias: Optional[torch.Tensor], + self_trans: bool, + other_trans: bool, + *, + alpha: float=1.0, + beta: float=0.0, + act: int=1, + multi_stream: bool=False, + 
out: torch.Tensor + ) -> None: + pass - -@impl("_C::awq_dequantize", "CUDA") -def awq_dequantize_cuda( - qweight: torch.Tensor, - scales: torch.Tensor, - zeros: torch.Tensor, - quant_type: int = 0, - align_type: int = 1, -) -> torch.Tensor: - weight = torch.empty( - (qweight.shape[0], qweight.shape[1] * 8), - dtype=torch.float16, - device=qweight.device, - ) - group_m = int(qweight.shape[0] / scales.shape[0]) - out = xtorch_ops.awq_dequantize( - qweight=qweight, - scales=scales, - zeros=zeros, - weight=weight, - group_m=group_m, - quant_type=quant_type, - align_type=align_type, - ) - return weight - - -def _fake_awq_dequantize( - qweight: torch.Tensor, - scales: torch.Tensor, - zeros: torch.Tensor, - quant_type: int = 0, - align_type: int = 1, -) -> torch.Tensor: - weight = torch.empty( - (qweight.shape[0], qweight.shape[1] * 8), - dtype=torch.float16, - device=qweight.device, - ) - return weight - - -awq_dequantize.register_fake(_fake_awq_dequantize) - - -################################################## -# ------------------ awq_gemm ------------------- -################################################## -@custom_op("_C::awq_gemm", mutates_args=()) -def awq_gemm( +@custom_op("_C::swiglu", mutates_args=()) +def swiglu( x: torch.Tensor, - qweight: torch.Tensor, - scale: torch.Tensor, - zeros: torch.Tensor, - align_type: int = 1, -) -> torch.Tensor: - out = torch.empty( - (x.shape[0], qweight.shape[1] * 8), dtype=torch.float16, device=x.device - ) - group_size = int(qweight.shape[0] / scale.shape[0]) - xtorch_ops.awq_gemm( - x=x, - w=qweight, - scale=scale, - zeros=zeros, - out=out, - align_type=align_type, - group_size=group_size, - ) - return out - - -@impl("_C::awq_gemm", "CUDA") -def awq_gemm_cuda( - x: torch.Tensor, - qweight: torch.Tensor, - scale: torch.Tensor, - zeros: torch.Tensor, - align_type: int = 1, -) -> torch.Tensor: - out = torch.empty( - (x.shape[0], qweight.shape[1] * 8), dtype=torch.float16, device=x.device - ) - group_size = 
int(qweight.shape[0] / scale.shape[0]) - xtorch_ops.awq_gemm( - x=x, - w=qweight, - scale=scale, - zeros=zeros, - out=out, - align_type=align_type, - group_size=group_size, - ) - return out - - -def _fake_awq_gemm( - x: torch.Tensor, - qweight: torch.Tensor, - scale: torch.Tensor, - zeros: torch.Tensor, - align_type: int = 1, -) -> torch.Tensor: - out = torch.empty( - (x.shape[0], qweight.shape[1] * 8), dtype=torch.float16, device=x.device - ) - return out - - -awq_gemm.register_fake(_fake_awq_gemm) - - -################################################## -# ---------------- gptq_shuffle ------------------ -################################################## -@custom_op("_C::gptq_shuffle", mutates_args=()) -def gptq_shuffle( - q_weight: torch.Tensor, - q_perm: torch.Tensor, - bit: int, + y: torch.Tensor, + axis: int=-1, + turn: bool=True ) -> None: - xtorch_ops.gptq_shuffle(weight=q_weight, perm=q_perm, bit=bit) + xtorch_ops.swiglu( + x, + y, + ) -@impl("_C::gptq_shuffle", "CUDA") -def gptq_shuffle_cuda( - q_weight: torch.Tensor, - q_perm: torch.Tensor, - bit: int, +@impl("_C::swiglu", "CUDA") +def swiglu_cuda( + x: torch.Tensor, + y: torch.Tensor, + axis: int=-1, + turn: bool=True ) -> None: - xtorch_ops.gptq_shuffle(weight=q_weight, perm=q_perm, bit=bit) + xtorch_ops.swiglu( + x, + y, + ) + +def _fake_swiglu( + x: torch.Tensor, + y: torch.Tensor, + axis: int=-1, + turn: bool=True): + return None + +swiglu.register_fake(_fake_swiglu) -def _fake_gptq_shuffle( - q_weight: torch.Tensor, - q_perm: torch.Tensor, - bit: int, + +@custom_op("_C::swigluoai_and_mul", mutates_args=()) +def swigluoai_and_mul( + x: torch.Tensor, + alpha: float = 1.702, + limit: float = 7.0, + axis: int = -1, + turn: bool = True +) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * 
glu + return gated_output + +@impl("_C::swigluoai_and_mul", "CUDA") +def swigluoai_and_mul_cuda( + x: torch.Tensor, + alpha: float = 1.702, + limit: float = 7.0, + axis: int = -1, + turn: bool = True +) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * glu + return gated_output + +def _fake_swigluoai_and_mul( + x: torch.Tensor, + alpha: float = 1.702, + limit: float = 7.0, + axis: int = -1, + turn: bool = True +) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * glu + return gated_output + +swigluoai_and_mul.register_fake(_fake_swigluoai_and_mul) + +@custom_op("_C::moe_softmax_topk", mutates_args=()) +def moe_softmax_topk( + x: torch.Tensor, + normed_score: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + axis: int = -1, + turn: bool = True +) -> None: + xtorch_ops.moe_softmax_topk( + x, + normed_score, + topk_index, + block_statistic + ) + +@impl("_C::moe_softmax_topk", "CUDA") +def moe_softmax_topk_cuda( + x: torch.Tensor, + normed_score: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + axis: int = -1, + turn: bool = True +) -> None: + xtorch_ops.moe_softmax_topk( + x, + normed_score, + topk_index, + block_statistic + ) + +def _fake_moe_softmax_topk( + x: torch.Tensor, + normed_score: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + axis: int = -1, + turn: bool = True ) -> None: return None +moe_softmax_topk.register_fake(_fake_moe_softmax_topk) -gptq_shuffle.register_fake(_fake_gptq_shuffle) + +@custom_op("_C::moe_ffn_block", mutates_args=()) +def 
moe_ffn_block( + out: torch.Tensor, + x: torch.Tensor, + expert_num: int, + moe_top_k: int, + gate_w: torch.Tensor, + inter_w: torch.Tensor, + output_w: torch.Tensor, + renormalize: bool = True, + use_grouped_topk: bool = False, + expert_group_num: Optional[int] = 0, + topk_group: Optional[int] = 0, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, +) -> None: + xtorch_ops.moe_ffn_block( + x=x, + gate_w=gate_w, + inter_w=inter_w, + output_w=output_w, + expert_num=expert_num, + moe_top_k=moe_top_k, + topk_group=topk_group, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + expert_group_num=expert_group_num, + out=out, + ) + + +@impl("_C::moe_ffn_block", "CUDA") +def moe_ffn_block_cuda( + out: torch.Tensor, + x: torch.Tensor, + expert_num: int, + moe_top_k: int, + gate_w: torch.Tensor, + inter_w: torch.Tensor, + output_w: torch.Tensor, + renormalize: bool = True, + use_grouped_topk: bool = False, + expert_group_num: Optional[int] = 0, + topk_group: Optional[int] = 0, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, +) -> None: + xtorch_ops.moe_ffn_block( + x=x, + gate_w=gate_w, + inter_w=inter_w, + output_w=output_w, + expert_num=expert_num, + moe_top_k=moe_top_k, + topk_group=topk_group, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + expert_group_num=expert_group_num, + out=out, + ) + +def _fake_moe_ffn_block( + out: torch.Tensor, + x: torch.Tensor, + expert_num: int, + moe_top_k: int, + gate_w: torch.Tensor, + inter_w: torch.Tensor, + output_w: torch.Tensor, + renormalize: bool = True, + use_grouped_topk: bool = False, + expert_group_num: Optional[int] = 0, + topk_group: Optional[int] = 0,): + return None + +moe_ffn_block.register_fake(_fake_moe_ffn_block) + + +@custom_op("_C::moe_ffn_per_token_block", mutates_args=()) +def moe_ffn_per_token_block( + x: torch.Tensor, + inter_weight: torch.Tensor, + inter_scale: torch.Tensor, + outer_weight: torch.Tensor, + 
outer_scale: torch.Tensor, + top_k: int, + global_num_experts: int, + linear_weights: Optional[torch.Tensor] = None, + expert_map: Optional[torch.Tensor] = None, + activation: str = "silu", + output: Optional[torch.Tensor] = None, + use_expert_parallel: bool = False, + ep_size: int = 1, + ep_rank: int = 0 +) -> None: + xtorch_ops.moe_ffn_per_token_block( + x=x, + inter_weight=inter_weight, + inter_scale=inter_scale, + outer_weight=outer_weight, + outer_scale=outer_scale, + gate_weight=linear_weights, + expert_num=global_num_experts, + moe_top_k=top_k, + act_type=activation, + use_expert_parallel=use_expert_parallel, + ep_size=ep_size, + ep_rank=ep_rank, + out=output, + ) + +@impl("_C::moe_ffn_per_token_block", "CUDA") +def moe_ffn_per_token_block_cuda( + x: torch.Tensor, + inter_weight: torch.Tensor, + inter_scale: torch.Tensor, + outer_weight: torch.Tensor, + outer_scale: torch.Tensor, + top_k: int, + global_num_experts: int, + linear_weights: Optional[torch.Tensor] = None, + expert_map: Optional[torch.Tensor] = None, + activation: str = "silu", + output: Optional[torch.Tensor] = None, + use_expert_parallel: bool = False, + ep_size: int = 1, + ep_rank: int = 0 +) -> None: + xtorch_ops.moe_ffn_per_token_block( + x=x, + inter_weight=inter_weight, + inter_scale=inter_scale, + outer_weight=outer_weight, + outer_scale=outer_scale, + gate_weight=linear_weights, + expert_num=global_num_experts, + moe_top_k=top_k, + act_type=activation, + use_expert_parallel=use_expert_parallel, + ep_size=ep_size, + ep_rank=ep_rank, + out=output, + ) + +def _fake_moe_ffn_per_token_block( + x: torch.Tensor, + inter_weight: torch.Tensor, + inter_scale: torch.Tensor, + outer_weight: torch.Tensor, + outer_scale: torch.Tensor, + top_k: int, + global_num_experts: int, + linear_weights: Optional[torch.Tensor] = None, + expert_map: Optional[torch.Tensor] = None, + activation: str = "silu", + output: Optional[torch.Tensor] = None, + use_expert_parallel: bool = False, + ep_size: int = 1, + ep_rank: 
int = 0 +) -> None: + # Fake implementation can be a no-op or a simple operation + if output is not None: + output.copy_(x) # Example: simply copy input to output + +# Register the fake implementation +moe_ffn_per_token_block.register_fake(_fake_moe_ffn_per_token_block) + + +@custom_op("_C::rotary_embedding", mutates_args=()) +def rotary_embedding( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None : + xtorch_ops.rotary_embedding( + positions=positions, + query=query, + key=key, + head_size=head_size, + cos_sin_cache=cos_sin_cache, + is_neox=is_neox) + +@impl("_C::rotary_embedding", "CUDA") +def rotary_embedding_cuda( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + xtorch_ops.rotary_embedding( + positions=positions, + query=query, + key=key, + head_size=head_size, + cos_sin_cache=cos_sin_cache, + is_neox=is_neox) + + +def _fake_rotary_embedding( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +)-> None: + return None + + +rotary_embedding.register_fake(_fake_rotary_embedding) + +@custom_op("_C::moe_softmax_topk_norm", mutates_args=()) +def moe_softmax_topk_norm( + x: torch.Tensor, + normed_score: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + stable: bool = True +) -> None: + xtorch_ops.moe_softmax_topk_norm( + x, + normed_score, + topk_index, + block_statistic, + stable + ) + +@impl("_C::moe_softmax_topk_norm", "CUDA") +def moe_softmax_topk_norm_cuda( + x: torch.Tensor, + normed_score: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + stable: bool = True +) -> None: + xtorch_ops.moe_softmax_topk_norm( + x, + normed_score, + topk_index, + block_statistic, + stable + ) + +def _fake_moe_softmax_topk_norm( + x: torch.Tensor, + 
normed_score: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + stable: bool = True +) -> None: + return None + +moe_softmax_topk_norm.register_fake(_fake_moe_softmax_topk_norm) + +@custom_op("_C::gen_block_statistic", mutates_args=()) +def gen_block_statistic( + topk_ids: torch.Tensor, + block_statistic: torch.Tensor +)-> None: + xtorch_ops.gen_block_statistic( + topk_ids,block_statistic + ) + +@impl("_C::gen_block_statistic", "CUDA") +def gen_block_statistic_cuda( + topk_ids: torch.Tensor, + block_statistic: torch.Tensor +)-> None: + xtorch_ops.gen_block_statistic( + topk_ids,block_statistic + ) + +def fake_gen_block_statistic( + topk_ids: torch.Tensor, + block_statistic: torch.Tensor +)-> None: + return None + +gen_block_statistic.register_fake(fake_gen_block_statistic) + +@custom_op("_C::moe_pre_sorted", mutates_args=()) +def moe_pre_sorted( + x: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + moe_expand: torch.Tensor, + moe_index: torch.Tensor, + expert_m: torch.Tensor, + sorted_tokens_num_lod: torch.Tensor, + index_have_neg: bool = False +)-> None: + xtorch_ops.moe_pre_sorted( + x, + topk_index, + block_statistic, + moe_expand, + moe_index, + expert_m, + sorted_tokens_num_lod) + +@impl("_C::moe_pre_sorted", "CUDA") +def moe_pre_sorted_cuda( + x: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + moe_expand: torch.Tensor, + moe_index: torch.Tensor, + expert_m: torch.Tensor, + sorted_tokens_num_lod: torch.Tensor, + index_have_neg: bool = False +)-> None: + xtorch_ops.moe_pre_sorted( + x, + topk_index, + block_statistic, + moe_expand, + moe_index, + expert_m, + sorted_tokens_num_lod) + +def fake_moe_pre_sorted( + x: torch.Tensor, + topk_index: torch.Tensor, + block_statistic: torch.Tensor, + moe_expand: torch.Tensor, + moe_index: torch.Tensor, + expert_m: torch.Tensor, + sorted_tokens_num_lod: torch.Tensor, + index_have_neg: bool = False +)-> None: + return None + 
+moe_pre_sorted.register_fake(fake_moe_pre_sorted) + +@custom_op("_C::moe_fc", mutates_args=()) +def moe_fc( + x: torch.Tensor, + weight: torch.Tensor, + sorted_tokens_num_lod: torch.Tensor, + sorted_tokens_idx: torch.Tensor, + moe_topk: int, + y: torch.Tensor +)-> None: + xtorch_ops.moe_fc( + x, + weight, + sorted_tokens_num_lod, + sorted_tokens_idx, + moe_topk, + y) + +@impl("_C::moe_fc", "CUDA") +def moe_fc_cuda( + x: torch.Tensor, + weight: torch.Tensor, + sorted_tokens_num_lod: torch.Tensor, + sorted_tokens_idx: torch.Tensor, + moe_topk: int, + y: torch.Tensor +)-> None: + xtorch_ops.moe_fc( + x, + weight, + sorted_tokens_num_lod, + sorted_tokens_idx, + moe_topk, + y) + +def fake_moe_fc( + x: torch.Tensor, + weight: torch.Tensor, + sorted_tokens_num_lod: torch.Tensor, + sorted_tokens_idx: torch.Tensor, + moe_topk: int, + y: torch.Tensor +)-> None: + return None + +moe_fc.register_fake(fake_moe_fc) + +@custom_op("_C::moe_post", mutates_args=()) +def moe_post( + x: torch.Tensor, + moe_index: torch.Tensor, + normed_scale: torch.Tensor, + dequant_scale: torch.Tensor, + y: torch.Tensor +)-> None: + xtorch_ops.moe_post( + x, + moe_index, + normed_scale, + dequant_scale, + y + ) + +@impl("_C::moe_post", "CUDA") +def moe_post_cuda( + x: torch.Tensor, + moe_index: torch.Tensor, + normed_scale: torch.Tensor, + dequant_scale: torch.Tensor, + y: torch.Tensor +)-> None: + xtorch_ops.moe_post( + x, + moe_index, + normed_scale, + dequant_scale, + y) + + +def fake_moe_post( + x: torch.Tensor, + moe_index: torch.Tensor, + normed_scale: torch.Tensor, + dequant_scale: torch.Tensor, + y: torch.Tensor +)-> None: + return None + +moe_post.register_fake(fake_moe_post) diff --git a/vllm_kunlun/v1/sample/__init__.py b/vllm_kunlun/worker/__init__.py similarity index 100% rename from vllm_kunlun/v1/sample/__init__.py rename to vllm_kunlun/worker/__init__.py diff --git a/vllm_kunlun/worker/model_runner.py b/vllm_kunlun/worker/model_runner.py new file mode 100644 index 0000000..a1c08fa 
--- /dev/null +++ b/vllm_kunlun/worker/model_runner.py @@ -0,0 +1,2043 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import dataclasses +import gc +import inspect +import itertools +import time +import weakref +from contextlib import contextmanager +from dataclasses import dataclass +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, + Tuple, Type, TypeVar, Union) + +import numpy as np +import torch +import torch.distributed +import torch.nn as nn +from tqdm.auto import tqdm + +import vllm.envs as envs +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.attention.backends.abstract import AttentionState +from vllm.attention.backends.utils import CommonAttentionState +from vllm.compilation.counter import compilation_counter +from vllm.config import CompilationLevel, VllmConfig +from vllm.core.scheduler import SchedulerOutputs +from vllm.distributed import broadcast_tensor_dict, get_pp_group +from vllm.distributed.kv_transfer import get_kv_transfer_group +from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, + graph_capture) +from vllm.forward_context import get_forward_context, set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +from vllm.model_executor import SamplingMetadata, SamplingMetadataCache +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, + get_sampler) +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.model_executor.models.utils import 
set_cpu_offload_max_bytes +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs, MultiModalPlaceholderMap, + MultiModalRegistry) +from vllm.sampling_params import SamplingParams +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata +from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, + async_tensor_h2d, flatten_2d_lists, + is_pin_memory_available, supports_dynamo, + weak_ref_tensor) +from vllm.worker.model_runner_base import ( + InputProcessingError, ModelRunnerBase, ModelRunnerInputBase, + ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +logger = init_logger(__name__) + +LORA_WARMUP_RANK = 8 + +_NUM_WARMUP_ITERS = 2 + +TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") + +# For now, bump up cache limits for recompilations during CUDA graph warmups. +torch._dynamo.config.cache_size_limit = 128 +torch._dynamo.config.accumulated_cache_size_limit = 128 + + +@dataclass(frozen=True) +class ModelInputForGPU(ModelRunnerInputBase): + """ + This base class contains metadata needed for the base model forward pass + but not metadata for possible additional steps, e.g., sampling. Model + runners that run additional steps should subclass this method to add + additional fields. 
+ """ + input_tokens: Optional[torch.Tensor] = None + inputs_embeds: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + token_types: Optional[torch.Tensor] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None + attn_metadata: Optional["AttentionMetadata"] = None + multi_modal_kwargs: Optional[BatchedTensorInputs] = None + request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None + finished_requests_ids: Optional[List[str]] = None + virtual_engine: int = 0 + async_callback: Optional[Callable] = None + scheduler_outputs: Optional[SchedulerOutputs] = None + previous_hidden_states: Optional[torch.Tensor] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "inputs_embeds": self.inputs_embeds, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + "virtual_engine": self.virtual_engine, + "request_ids_to_seq_ids": self.request_ids_to_seq_ids, + "finished_requests_ids": self.finished_requests_ids, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForGPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForGPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + # Exclude `async_callback` to be able to pickle this object + def __getstate__(self): + state = self.__dict__.copy() + del state["async_callback"] + return state + + # TODO: What happens when we depickle this object? + # How can we update this callback to properly pass it to the engine? 
@dataclass(frozen=True)
class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
    """Model input extended with the metadata needed for sampling.

    Used by the ModelRunner.
    """
    sampling_metadata: Optional["SamplingMetadata"] = None
    # Used for speculative decoding. We do not broadcast it because it is only
    # used by the driver worker.
    is_prompt: Optional[bool] = None

    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        """Return the broadcastable fields as a plain dict.

        The dict holds the fields named below, then whatever the attention
        and sampling metadata helpers add to it. ``is_prompt`` is not
        included.
        """
        broadcast_fields = (
            "input_tokens",
            "inputs_embeds",
            "input_positions",
            "lora_requests",
            "lora_mapping",
            "multi_modal_kwargs",
            "virtual_engine",
            "request_ids_to_seq_ids",
            "finished_requests_ids",
        )
        payload: Dict[str, Any] = {
            field_name: getattr(self, field_name)
            for field_name in broadcast_fields
        }
        _add_attn_metadata_broadcastable_dict(payload, self.attn_metadata)
        _add_sampling_metadata_broadcastable_dict(payload,
                                                  self.sampling_metadata)
        return payload

    @classmethod
    def from_broadcasted_tensor_dict(
        cls,
        tensor_dict: Dict[str, Any],
        attn_backend: Optional["AttentionBackend"] = None,
    ) -> "ModelInputForGPUWithSamplingMetadata":
        """Build an instance from a dict received via broadcast.

        Sampling metadata is restored first; attention metadata is restored
        only when an ``attn_backend`` is supplied.
        """
        restored = _init_sampling_metadata_from_tensor_dict(tensor_dict)
        if attn_backend is None:
            return cls(**restored)
        restored = _init_attn_metadata_from_tensor_dict(attn_backend, restored)
        return cls(**restored)
def simple_reinit(self):
    """Reset slot 0 of every per-sequence buffer in place.

    Existing list/set objects are emptied or overwritten rather than
    replaced, and the two optional payloads are set back to ``None``.
    """
    # Token-level buffers: empty the first (only) sequence's list.
    for token_buffer in (self.input_tokens, self.input_positions,
                         self.token_types):
        token_buffer[0].clear()  # type: ignore

    self.inputs_embeds = None  # type: ignore
    self.mrope_input_positions = None  # type: ignore

    # Length counters: zero the first (only) sequence's entry.
    for counters in (self.seq_lens, self.orig_seq_lens, self.prompt_lens,
                     self.query_lens, self.context_lens,
                     self.curr_sliding_window_blocks):
        counters[0] = 0  # type: ignore

    # LoRA containers are shared across the group, so empty them entirely.
    for lora_container in (self.lora_index_mapping, self.lora_prompt_mapping,
                           self.lora_requests):
        lora_container.clear()  # type: ignore
+ lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, + + # Multi-modal inputs. + multi_modal_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, + + # Whether the prefix cache is hit (prefill only). + prefix_cache_hit: bool = False, + reinit: bool = False, + reinit_use_defaults: bool = False, + encoder_seq_len: int = 0, + ): + if reinit: + assert len(self.seq_ids) == len(seq_ids) # type: ignore + for i, seq_id in enumerate(seq_ids): + self.seq_ids[i] = seq_id # type: ignore + else: + self.seq_ids = seq_ids + + self.request_id = request_id + self.is_prompt = is_prompt + self.block_tables = block_tables + self.computed_block_nums = computed_block_nums + self.n_seqs = n_seqs + self.encoder_seq_len = encoder_seq_len + + if reinit: + if len(self.seq_ids) == 1 and reinit_use_defaults: + self.simple_reinit() + else: + if input_tokens: + self.input_tokens = input_tokens + else: + for seq_id in range(len(self.seq_ids)): + self.input_tokens[seq_id].clear() + + self.inputs_embeds = inputs_embeds + + if input_positions: + self.input_positions = input_positions + else: + for seq_id in range(len(self.seq_ids)): + self.input_positions[seq_id].clear() + + if token_types: + self.token_types = token_types + else: + for seq_id in range(len(self.seq_ids)): + self.token_types[seq_id].clear() + + self.mrope_input_positions = None + + if seq_lens: + self.seq_lens = seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.seq_lens[seq_id] = 0 + + if orig_seq_lens: + self.orig_seq_lens = orig_seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.orig_seq_lens[seq_id] = 0 + + if prompt_lens: + self.prompt_lens = prompt_lens + else: + for seq_id in range(len(self.seq_ids)): + self.prompt_lens[seq_id] = 0 + + if query_lens: + self.query_lens = query_lens + else: + for seq_id in 
range(len(self.seq_ids)): + self.query_lens[seq_id] = 0 + + if context_lens: + self.context_lens = context_lens + else: + for seq_id in range(len(self.seq_ids)): + self.context_lens[seq_id] = 0 + + if curr_sliding_window_blocks: + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks + else: + for seq_id in range(len(self.seq_ids)): + self.curr_sliding_window_blocks[seq_id] = 0 + + if lora_index_mapping: + self.lora_index_mapping = lora_index_mapping + else: + self.lora_index_mapping.clear() + + if lora_prompt_mapping: + self.lora_prompt_mapping = lora_prompt_mapping + else: + self.lora_prompt_mapping.clear() + + if lora_requests: + self.lora_requests = lora_requests + else: + self.lora_requests.clear() + + else: + self.input_tokens = input_tokens or [] + self.inputs_embeds = inputs_embeds + self.input_positions = input_positions or [] + self.token_types = token_types or [] + self.mrope_input_positions = mrope_input_positions or None + self.seq_lens = seq_lens or [] + self.orig_seq_lens = orig_seq_lens or [] + self.prompt_lens = prompt_lens or [] + self.query_lens = query_lens or [] + self.context_lens = context_lens or [] + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks or [] + + self.lora_index_mapping = lora_index_mapping or [] + self.lora_prompt_mapping = lora_prompt_mapping or [] + self.lora_requests = lora_requests or set() + + self.multi_modal_kwargs = multi_modal_kwargs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps + self.prefix_cache_hit = prefix_cache_hit + + self.n_seqs = len(self.seq_ids) + + if not reinit: + self.__post_init__() + + def __post_init__(self): + self.n_seqs = len(self.seq_ids) + + self.input_tokens = [[] for _ in range(self.n_seqs)] + self.input_positions = [[] for _ in range(self.n_seqs)] + self.token_types = [[] for _ in range(self.n_seqs)] + self.mrope_input_positions = None + self.seq_lens = [0] * self.n_seqs + self.orig_seq_lens = [0] * self.n_seqs + self.prompt_lens = [0] * 
def __repr__(self) -> str:
    """Return a debug string listing the main fields of this object.

    Only the shape of ``inputs_embeds`` is shown (``None`` when the
    attribute has no ``shape``). The string is terminated with the closing
    parenthesis that matches the opening ``InterDataForSeqGroup(``.
    """
    return (f"InterDataForSeqGroup("
            f"request_id={self.request_id}, "
            f"seq_ids={self.seq_ids}, "
            f"is_prompt={self.is_prompt}, "
            f"block_tables={self.block_tables}, "
            f"computed_block_nums={self.computed_block_nums}, "
            f"n_seqs={self.n_seqs}, "
            f"input_tokens={self.input_tokens}, "
            f"inputs_embeds.shape="
            f"{getattr(self.inputs_embeds, 'shape', None)}, "
            f"input_positions={self.input_positions}, "
            f"token_types={self.token_types}, "
            f"mrope_input_positions={self.mrope_input_positions}, "
            f"seq_lens={self.seq_lens}, "
            f"orig_seq_lens={self.orig_seq_lens}, "
            f"query_lens={self.query_lens}, "
            f"context_lens={self.context_lens}, "
            f"multi_modal_kwargs={self.multi_modal_kwargs})")
+ # WARNING: The order of the functions matters! + self.per_seq_compute_fns = [ + self._compute_lens, + self._compute_for_prefix_cache_hit, + self._compute_for_sliding_window, + self._compute_lora_input, + ] + # Compute functions for each sequence group. + # WARNING: The order of the functions matters! + self.per_seq_group_compute_fns = [ + self._compute_multi_modal_input, + ] + + self.runner = runner + self.model_input_cls = self.runner._model_input_cls + self.attn_backend = self.runner.attn_backend + self.scheduler_config = self.runner.scheduler_config + self.sliding_window = self.runner.sliding_window + self.block_size = self.runner.block_size + self.enable_lora = self.runner.lora_config is not None + + # Attention metadata inputs. + if self.attn_backend is not None: + # spec decode (e.g. Medusa) does not have atten backend + self.attn_metadata_builder = self.attn_backend.get_builder_cls()( + weakref.proxy(self)) + + # Engine/Model configurations. + self.chunked_prefill_enabled = ( + self.scheduler_config is not None + and self.scheduler_config.chunked_prefill_enabled) + if self.sliding_window is not None: + self.sliding_window_blocks = ( + self.sliding_window + self.block_size - 1) // self.block_size + self.block_aligned_sliding_window = \ + self.sliding_window_blocks * self.block_size + + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.finished_requests_ids = finished_requests_ids + + # if the current batch is decode-only. + # will be set to False if there is any non-decode request. + self.decode_only = True + + # Intermediate data (data in CPU before going to GPU) for + # the current sequence group. 
+ self.inter_data_list: List[ + ModelInputForGPUBuilder.InterDataForSeqGroup] = [] + + self.attn_metadata_builder.prepare() + + def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, + seq_group_metadata: SequenceGroupMetadata): + """Compute context length, sequence length and tokens + for the given sequence data. + """ + seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] + token_chunk_size = seq_group_metadata.token_chunk_size + + # Compute context length (the number of tokens that are + # already computed) and sequence length (total number of tokens). + + seq_len = seq_data.get_len() + if inter_data.is_prompt: + context_len = seq_data.get_num_computed_tokens() + seq_len = min(seq_len, context_len + token_chunk_size) + elif self.runner.model_config.is_encoder_decoder: + context_len = seq_len - 1 + else: + context_len = seq_data.get_num_computed_tokens() + + # Compute tokens. + if seq_data.prompt_embeds is None: + tokens = seq_data.get_token_ids()[context_len:seq_len] + prompt_embeds = None + else: + tokens = [0] * (seq_len - context_len) + prompt_embeds = seq_data.get_token_embeddings( + )[context_len:seq_len] + + token_types = seq_group_metadata.token_type_ids + + inter_data.seq_lens[seq_idx] = seq_len + inter_data.orig_seq_lens[seq_idx] = seq_len + inter_data.prompt_lens[seq_idx] = seq_data.get_prompt_len() + inter_data.context_lens[seq_idx] = context_len + inter_data.input_tokens[seq_idx].extend(tokens) + inter_data.inputs_embeds = prompt_embeds + inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) + inter_data.token_types[seq_idx].extend( + token_types if token_types else []) + inter_data.query_lens[seq_idx] = seq_len - context_len + + if seq_data.mrope_position_delta is not None: + if inter_data.mrope_input_positions is None: + inter_data.mrope_input_positions = [None] * inter_data.n_seqs + + inter_data.mrope_input_positions[ + seq_idx] = MRotaryEmbedding.get_next_input_positions( + 
def _compute_for_prefix_cache_hit(
        self, inter_data: InterDataForSeqGroup, seq_idx: int,
        seq_group_metadata: SequenceGroupMetadata):
    """Trim one sequence's prefill inputs so cached blocks are not recomputed.

    Stores on ``inter_data.prefix_cache_hit`` whether this is a prompt with
    at least one computed block and no sliding window. On a hit, the cached
    token count is reported to the sequence data, and the token, position
    and token-type buffers plus ``context_lens`` / ``query_lens`` are
    adjusted for the part that still has to be computed.
    """
    cached_blocks = inter_data.computed_block_nums

    # Note that prefix caching does not support sliding window.
    is_hit = (cached_blocks is not None and len(cached_blocks) > 0
              and self.sliding_window is None and inter_data.is_prompt)
    inter_data.prefix_cache_hit = is_hit
    if not is_hit:
        return
    assert cached_blocks is not None

    # Number of prompt tokens covered by the cache hit. With chunked prefill
    # this may be larger than the sequence length.
    cached_len = len(cached_blocks) * self.block_size
    seq_id = inter_data.seq_ids[seq_idx]
    seq_group_metadata.seq_data[seq_id].update_num_cached_tokens(cached_len)

    # Prompt tokens computed so far, and computed chunks + current chunk.
    computed_len = inter_data.context_lens[seq_idx]
    total_len = inter_data.seq_lens[seq_idx]

    if cached_len <= computed_len:
        # We already passed the cache hit region, so do normal computation.
        return

    if cached_len < total_len:
        # Partial hit. Compute only the part beyond the cached prefix.
        remaining = slice(cached_len - computed_len, None)
        new_context_len = cached_len
    else:
        # Full hit. Only compute the last token to avoid erroneous
        # behavior. FIXME: Ideally we should directly mark all tokens as
        # computed in the scheduler and do not schedule this sequence, so
        # this case should not happen.
        remaining = slice(-1, None)
        new_context_len = total_len - 1

    for per_seq_buffers in (inter_data.input_tokens,
                            inter_data.input_positions,
                            inter_data.token_types):
        per_seq_buffers[seq_idx] = per_seq_buffers[seq_idx][remaining]
    inter_data.context_lens[seq_idx] = new_context_len
    inter_data.query_lens[seq_idx] = total_len - new_context_len
def _compute_lora_input(self, inter_data: InterDataForSeqGroup,
                        seq_idx: int,
                        seq_group_metadata: SequenceGroupMetadata):
    """If LoRA is enabled, append this sequence's LoRA index/prompt mapping.

    One entry is appended to ``lora_index_mapping`` (the LoRA id repeated
    once per query token) and one to ``lora_prompt_mapping``. The request
    itself is recorded in ``lora_requests`` only when its id is positive.
    """
    if not self.enable_lora:
        return

    adapter_id = seq_group_metadata.lora_int_id
    if adapter_id > 0:
        inter_data.lora_requests.add(seq_group_metadata.lora_request)

    num_query_tokens = inter_data.query_lens[seq_idx]
    inter_data.lora_index_mapping.append([adapter_id] * num_query_tokens)

    params = seq_group_metadata.sampling_params
    if params and params.prompt_logprobs is not None:
        # Prompt logprobs requested: one id per query token.
        prompt_entry = [adapter_id] * num_query_tokens
    else:
        # A single id unless chunked prefill is on and this step does not
        # sample, in which case the entry stays empty.
        skip_entry = (self.chunked_prefill_enabled
                      and not seq_group_metadata.do_sample)
        prompt_entry = [] if skip_entry else [adapter_id]
    inter_data.lora_prompt_mapping.append(prompt_entry)
+ if not mm_kwargs and not inter_data.is_prompt: + return + + inter_data.multi_modal_kwargs = mm_kwargs + inter_data.multi_modal_placeholder_maps = placeholder_maps + + # special processing for mrope position deltas. + if self.runner.model_config.uses_mrope: + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", + None) + + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) + use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) + hf_config = self.runner.model_config.hf_config + + inter_data.mrope_input_positions = [None] * inter_data.n_seqs + for seq_idx in range(inter_data.n_seqs): + seq_data = seq_group_metadata.seq_data[ + inter_data.seq_ids[seq_idx]] + token_ids = seq_data.get_token_ids() + + mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + + seq_data.mrope_position_delta = mrope_position_delta + inter_data.mrope_input_positions[ + seq_idx] = mrope_input_positions + + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): + """Add a sequence group to the builder.""" + seq_ids = seq_group_metadata.seq_data.keys() + n_seqs = len(seq_ids) + is_prompt = seq_group_metadata.is_prompt + + if is_prompt: + assert n_seqs == 1 + self.decode_only = False + + encoder_seq_len = 0 + + if self.runner.model_config.is_encoder_decoder: + encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() + + inter_data = self.init_cached_inter_data( + request_id=seq_group_metadata.request_id, + seq_ids=seq_ids, + is_prompt=is_prompt, + 
def _use_captured_graph(self,
                        batch_size: int,
                        decode_only: bool,
                        max_decode_seq_len: int,
                        max_encoder_seq_len: int = 0) -> bool:
    """Tell whether this batch may run through a captured graph.

    True only for a decode-only batch when eager mode is not enforced, both
    sequence lengths are within ``runner.max_seq_len_to_capture`` and the
    batch size is within ``runner.max_batchsize_to_capture``.
    """
    if not decode_only:
        return False

    runner = self.runner
    if runner.model_config.enforce_eager:
        return False

    seq_len_limit = runner.max_seq_len_to_capture
    lengths_fit = (max_decode_seq_len <= seq_len_limit
                   and max_encoder_seq_len <= seq_len_limit)
    return lengths_fit and batch_size <= runner.max_batchsize_to_capture
+ """ + decode_only = self.decode_only + if not decode_only: + # Early exit so we can treat num_seqs as the batch_size below. + return -1 + + # batch_size out of this function refers to the number of input + # tokens being scheduled. This conflation of num_seqs as batch_size + # is valid as this is a decode-only case. + batch_size = num_seqs + if not self._use_captured_graph(batch_size, decode_only, + max_decode_seq_len, + max_encoder_seq_len): + return -1 + + graph_batch_size = self.runner.vllm_config.pad_for_cudagraph( + batch_size) + assert graph_batch_size >= batch_size + return graph_batch_size - batch_size + + def build(self) -> ModelInputForGPU: + """Finalize the builder intermediate data and + create on-device tensors. + """ + # Combine and flatten intermediate data. + input_tokens = list[int]() + inputs_embeds_list = list[torch.Tensor]() + token_types = list[int]() + for inter_data in self.inter_data_list: + for cur_input_tokens in inter_data.input_tokens: + input_tokens.extend(cur_input_tokens) + for cur_token_types in inter_data.token_types: + token_types.extend(cur_token_types) + if inter_data.inputs_embeds is not None: + inputs_embeds_list.append( + inter_data.inputs_embeds.to( + dtype=self.runner.model_config.dtype, + device=self.runner.device)) + inputs_embeds: Optional[torch.Tensor] + if len(inputs_embeds_list) == 0: + inputs_embeds = None + else: + inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( + dtype=self.runner.model_config.dtype, + device=self.runner.device) + assert len(inputs_embeds) == len(input_tokens) + + if not input_tokens and inputs_embeds is None: + # This may happen when all prefill requests hit + # prefix caching and there is no decode request. 
+ return self.model_input_cls() + + mrope_input_positions: Optional[List[List[int]]] = None + if any(inter_data.mrope_input_positions is not None + for inter_data in self.inter_data_list): + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + for inter_data in self.inter_data_list: + msections = inter_data.mrope_input_positions + if msections is None: + for _seq_input_positions in inter_data.input_positions: + mrope_input_positions[idx].extend( + _seq_input_positions) + else: + for _seq_mrope_input_positions in msections: + mrope_input_positions[idx].extend( + _seq_mrope_input_positions[idx]) + input_positions = None + else: + input_positions = [] + for inter_data in self.inter_data_list: + for cur_input_positions in inter_data.input_positions: + input_positions.extend(cur_input_positions) + + seq_lens = [] + query_lens = [] + max_decode_seq_len = 0 + max_encoder_seq_len = 0 + for inter_data in self.inter_data_list: + seq_lens.extend(inter_data.seq_lens) + query_lens.extend(inter_data.query_lens) + if not inter_data.is_prompt: + max_decode_seq_len = max(max_decode_seq_len, + max(inter_data.seq_lens)) + if self.runner.model_config.is_encoder_decoder: + max_encoder_seq_len = max(max_encoder_seq_len, + inter_data.encoder_seq_len) + + # Mapping from request IDs to sequence IDs. Used for Jamba models + # that manages the cache by itself. + request_ids_to_seq_ids = { + data.request_id: data.seq_ids + for data in self.inter_data_list + } + + cuda_graph_pad_size = self._get_cuda_graph_pad_size( + num_seqs=len(seq_lens), + max_decode_seq_len=max_decode_seq_len, + max_encoder_seq_len=max_encoder_seq_len) + + batch_size = len(input_tokens) + if cuda_graph_pad_size != -1: + # If cuda graph can be used, pad tensors accordingly. + # See `capture_model` API for more details. + # vLLM uses cuda graph only for decoding requests. + batch_size += cuda_graph_pad_size + + # Tokens and positions. 
+ if cuda_graph_pad_size: + input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) + assert self.runner.device is not None + input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, + self.runner.device, + self.runner.pin_memory) + + token_types_tensor = async_tensor_h2d(token_types, torch.long, + self.runner.device, + self.runner.pin_memory) \ + if token_types else None + + if mrope_input_positions is not None: + for idx in range(3): + mrope_input_positions[idx].extend( + itertools.repeat(0, cuda_graph_pad_size)) + input_positions_tensor = async_tensor_h2d(mrope_input_positions, + torch.long, + self.runner.device, + self.runner.pin_memory) + else: + input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) + input_positions_tensor = async_tensor_h2d(input_positions, + torch.long, + self.runner.device, + self.runner.pin_memory) + # Sequence and query lengths. + if cuda_graph_pad_size: + seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) + + # Attention metadata. + attn_metadata = self.attn_metadata_builder.build( + seq_lens, query_lens, cuda_graph_pad_size, batch_size) + + # LoRA data. + lora_requests = set() + lora_mapping = None + if self.enable_lora: + lora_requests = set(r for data in self.inter_data_list + for r in data.lora_requests) + lora_index_mapping = flatten_2d_lists([ + flatten_2d_lists(inter_data.lora_index_mapping) + for inter_data in self.inter_data_list + ]) + if cuda_graph_pad_size: + lora_index_mapping.extend( + itertools.repeat(0, cuda_graph_pad_size)) + lora_prompt_mapping = flatten_2d_lists([ + flatten_2d_lists(inter_data.lora_prompt_mapping) + for inter_data in self.inter_data_list + ]) + + lora_mapping = LoRAMapping( + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=not self.decode_only)) + + # Multi-modal data. 
+ multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None + ] + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + + return self.model_input_cls( + input_tokens=input_tokens_tensor, + inputs_embeds=inputs_embeds, + input_positions=input_positions_tensor, + token_types=token_types_tensor, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_mapping=lora_mapping, + lora_requests=lora_requests, + multi_modal_kwargs=multi_modal_kwargs, + request_ids_to_seq_ids=request_ids_to_seq_ids, + finished_requests_ids=self.finished_requests_ids) + + +class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): + """ + Helper class for shared methods between GPU model runners. + """ + _model_input_cls: Type[TModelInputForGPU] + _builder_cls: Type[ModelInputForGPUBuilder] + builder: ModelInputForGPUBuilder + + def __init__( + self, + vllm_config: VllmConfig, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + return_hidden_states: bool = False, + input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + + ModelRunnerBase.__init__(self, vllm_config) + model_config = self.model_config + cache_config = self.cache_config + + self.is_driver_worker = is_driver_worker + self.return_hidden_states = return_hidden_states + + self.device = self.device_config.device + self.pin_memory = is_pin_memory_available() + + self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = model_config.get_sliding_window() + self.block_size = cache_config.block_size + self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture + self.max_batchsize_to_capture = \ + self.vllm_config.compilation_config.max_capture_size + + # + self.graph_runners: List[Dict[Tuple[int, bool], CUDAGraphRunner]] = [ + {} for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.graph_memory_pool: 
Optional[Tuple[ + int, int]] = None # Set during graph capture. + + self.has_inner_state = model_config.has_inner_state + + self.in_profile_run = False + + # When using CUDA graph, the input block tables must be padded to + # max_seq_len_to_capture. However, creating the block table in + # Python can be expensive. To optimize this, we cache the block table + # in numpy and only copy the actual input content at every iteration. + # The shape of the cached block table will be + # (max batch size to capture, max seq len to capture / block size). + self.graph_block_tables = np.zeros( + (self.max_batchsize_to_capture, self.get_max_block_per_batch()), + dtype=np.int32) + + self.cross_layer_shared_graph_block_tables = np.zeros( + (self.max_batchsize_to_capture, self.get_max_block_per_batch()), + dtype=np.int32) + + # Attention-free but stateful models like Mamba need a placeholder attn + # backend, as the attention metadata is needed to manage internal state. + # However we must bypass attention selection altogether for some models + # used for speculative decoding to avoid a divide-by-zero in + # model_config.get_head_size() + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) + needs_attn_backend = (num_attn_heads != 0 + or self.model_config.is_attention_free) + + self.attn_backend = get_attn_backend( + self.model_config.get_head_size(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + self.model_config.is_attention_free, + use_mla=self.model_config.use_mla, + ) if needs_attn_backend else None + if self.attn_backend: + self.attn_state = self.attn_backend.get_state_cls()( + weakref.proxy(self)) + else: + self.attn_state = CommonAttentionState(weakref.proxy(self)) + + # Multi-modal data support + self.input_registry = input_registry + self.mm_registry = mm_registry + + # Lazy initialization + self.model: nn.Module # Set after load_model + # Set after load_model. 
+ self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None + self.sampler = get_sampler() + + set_cpu_offload_max_bytes( + int(self.cache_config.cpu_offload_gb * 1024**3)) + + # Used to cache python objects + self.inter_data_cache: Dict[int, PyObjectCache] = {} + + # Using the PythonizationCache in Pipeline-Parallel clobbers the + # SequenceGroupToSample object. In Pipeline-Parallel, we have + # more than 1 Scheduler, resulting in a potential back-to-back + # prepare_model_inputs() call. This clobbers the cached + # SequenceGroupToSample objects, as we reset the cache during + # every prepare_model_inputs() call. + self.sampling_metadata_cache: SamplingMetadataCache = \ + SamplingMetadataCache() \ + if self.parallel_config.pipeline_parallel_size == 1 else None + + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + + def load_model(self) -> None: + logger.info("Starting to load model %s...", self.model_config.model) + with DeviceMemoryProfiler(self.device) as m: + time_before_load = time.perf_counter() + self.model = get_model(vllm_config=self.vllm_config) + if self.lora_config: + assert supports_lora( + self.model + ), f"{self.model.__class__.__name__} does not support LoRA yet." + + if supports_multimodal(self.model): + logger.warning( + "Regarding multimodal models, vLLM currently " + "only supports adding LoRA to language model.") + + # Use get_text_config() in case of multimodal models + text_config = self.model_config.hf_config.get_text_config() + + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, + self.vocab_size, + self.lora_config, + self.device, + self.model.embedding_modules, + self.model.embedding_padding_modules, + max_position_embeddings=text_config. 
+ max_position_embeddings, + ) + self.model = self.lora_manager.create_lora_manager(self.model) + time_after_load = time.perf_counter() + + self.model_memory_usage = m.consumed_memory + logger.info("Model loading took %.4f GiB and %.6f seconds", + self.model_memory_usage / GiB_bytes, + time_after_load - time_before_load) + + + if self.vllm_config.compilation_config.level ==\ + CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): + backend = self.vllm_config.compilation_config.init_backend( + self.vllm_config) + compilation_counter.dynamo_as_is_count += 1 + self.model = torch.compile( + self.model, + fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=backend) + + def get_model(self) -> nn.Module: + return self.model + + def save_sharded_state( + self, + path: str, + pattern: Optional[str] = None, + max_size: Optional[int] = None, + ) -> None: + from vllm.model_executor.model_loader import ShardedStateLoader + ShardedStateLoader.save_model( + self.model, + path, + pattern=pattern, + max_size=max_size, + ) + + def save_tensorized_model( + self, + tensorizer_config: TensorizerConfig, + ) -> None: + from vllm.model_executor.model_loader import TensorizerLoader + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + model_config=self.model_config, + ) + + def get_max_block_per_batch(self) -> int: + block_size = self.block_size + return (self.max_seq_len_to_capture + block_size - 1) // block_size + + def _prepare_model_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + finished_requests_ids: Optional[List[str]] = None + ) -> TModelInputForGPU: + """Helper method to prepare the model input based on a given sequence + group. Prepares metadata needed for the base model forward pass but not + metadata for possible additional steps, e.g., sampling. + + The API assumes seq_group_metadata_list is sorted by prefill -> decode. 
+ + The result tensors and data structure also batches input in prefill + -> decode order. For example, + + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + + If cuda graph is required, this API automatically pads inputs. + """ + self.builder.prepare(finished_requests_ids) + for seq_group_metadata in seq_group_metadata_list: + try: + self.builder.add_seq_group(seq_group_metadata) + except Exception as e: + # Raise an exception that tracks the ID of the bad request + raise InputProcessingError(seq_group_metadata.request_id, + str(e)) from e + + self.builder.reset_cached_inter_data() + + return self.builder.build() # type: ignore + + @contextmanager + def set_in_profile_run(self): + self.in_profile_run = True + try: + yield + finally: + self.in_profile_run = False + + @torch.inference_mode() + def profile_run(self) -> None: + max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + self._dummy_run(max_num_batched_tokens, max_num_seqs) + + def _add_dummy_loras(self, num_loras: int) -> list[LoRARequest]: + assert num_loras > 0 + assert self.lora_manager is not None + + dummy_lora_requests: list[LoRARequest] = [] + with self.lora_manager.dummy_lora_cache(): + for idx in range(num_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + return dummy_lora_requests + + def _remove_dummy_loras(self): + # Remove dummy loras. + assert self.lora_manager is not None + self.remove_all_loras() + + def _dummy_run(self, + max_num_batched_tokens: int, + max_num_seqs: int = 1) -> None: + with self.set_in_profile_run(): + # Enable top-k sampling to reflect the accurate memory usage. 
+ sampling_params = \ + SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + + # This represents the maximum number of different requests + # that will have unique loras, and therefore the max amount of + # memory consumption. Create dummy lora request copies from the + # lora request passed in, which contains a lora from the lora + # warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config: + dummy_lora_requests = self._add_dummy_loras( + self.lora_config.max_loras) + assert len(dummy_lora_requests) == self.lora_config.max_loras + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the + # total number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for multi-modal encoding, + # which needs to be accounted for when calculating the GPU blocks + # for vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) + if max_mm_tokens > 0: + max_num_seqs_orig = max_num_seqs + max_num_seqs = min(max_num_seqs, + max_num_batched_tokens // max_mm_tokens) + if max_num_seqs < 1: + expr = (f"min({max_num_seqs_orig}, " + f"{max_num_batched_tokens} // {max_mm_tokens})") + logger.warning( + "Computed max_num_seqs (%s) to be less than 1. 
" + "Setting it to the minimum value of 1.", expr) + max_num_seqs = 1 + + batch_size = 0 + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + batch_size += seq_len + + dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) + + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: dummy_data.seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data. + multi_modal_placeholders, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. 
+ kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(num_layers) + ] + finished_requests_ids = [seq.request_id for seq in seqs] + model_input = self.prepare_model_input( + seqs, finished_requests_ids=finished_requests_ids) + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = \ + self.model.make_empty_intermediate_tensors( + batch_size=batch_size, + dtype=self.model_config.dtype, + device=self.device) + + # Disable KV Scale Calculation for dummy data during profile run + if model_input.attn_metadata is not None: + model_input.attn_metadata.enable_kv_scales_calculation = False + + self.execute_model(model_input, kv_caches, intermediate_tensors) + torch.cuda.synchronize() + if self.lora_config: + self._remove_dummy_loras() + + return + + def remove_all_loras(self): + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.remove_all_adapters() + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() + + @torch.inference_mode() + def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: + 
"""Cuda graph capture a model. + + Note that CUDA graph's performance gain is negligible if number + of batched tokens are larger than 200. And since CUDA graph + requires fixed sized tensors, supporting large/variable batch + size requires high GPU memory overhead. Thus, vLLM only captures + decoding requests. Mixed batch (chunked prefill + decoding) or + prefill requests are not captured. + + Since it is used for decoding-only, it assumes there's only 1 token + per sequence in the batch. + """ + assert not self.model_config.enforce_eager + logger.info("Capturing cudagraphs for decoding. This may lead to " + "unexpected consequences if the model is not static. To " + "run the model in eager mode, set 'enforce_eager=True' or " + "use '--enforce-eager' in the CLI. " + "If out-of-memory error occurs during cudagraph capture," + " consider decreasing `gpu_memory_utilization` or " + "switching to eager mode. You can also reduce the " + "`max_num_seqs` as needed to decrease memory usage.") + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + # Prepare dummy inputs. These will be reused for all batch sizes. + max_batch_size = self.max_batchsize_to_capture + input_tokens = torch.zeros(max_batch_size, + dtype=torch.long, + device=self.device) + input_positions = torch.zeros(max_batch_size, + dtype=torch.long, + device=self.device) + inputs_embeds = torch.zeros( + (max_batch_size, self.model_config.get_hidden_size()), + dtype=self.model_config.dtype, + device=self.device) + if self.model_config.uses_mrope: + input_positions = torch.tile(input_positions, + (3, 1)).cuda(device=self.device) + # Prepare dummy previous_hidden_states only if needed by the model. + # This is used by draft models such as EAGLE. 
+ previous_hidden_states = None + if "previous_hidden_states" in inspect.signature( + self.model.forward).parameters: + previous_hidden_states = torch.empty( + [max_batch_size, + self.model_config.get_hidden_size()], + dtype=self.model_config.dtype, + device=self.device) + + intermediate_inputs = None + if not get_pp_group().is_first_rank: + intermediate_inputs = self.model.make_empty_intermediate_tensors( + batch_size=max_batch_size, + dtype=self.model_config.dtype, + device=self.device) + + dummy_lora_id: Optional[int] = None + dummy_lora_request: LoRARequest = [] + if self.lora_config: + # The goal is to capture the LoRA kernels in cuda graphs. + # for this purpose, as single dummy lora is sufficient. + dummy_lora_requests = self._add_dummy_loras(num_loras=1) + assert len(dummy_lora_requests) == 1 + dummy_lora_request = dummy_lora_requests[0] + dummy_lora_id = dummy_lora_request.lora_int_id + + with self.attn_state.graph_capture(max_batch_size), graph_capture( + self.device) as graph_capture_context: + # NOTE: Capturing the largest batch size first may help reduce the + # memory usage of CUDA graph. + for virtual_engine in range( + self.parallel_config.pipeline_parallel_size): + # We need to not only iterate over batch sizes, but also whether + # to use inputs_embeds or not, hence we use the cartesian + # product. 
+ cudagraph_capture_sizes = self.vllm_config.compilation_config\ + .cudagraph_capture_sizes + cudagraph_inputs_embeds = (( + True, False) if self.model_config.enable_prompt_embeds else + (False, )) + compilation_cases = itertools.product( + cudagraph_capture_sizes, + cudagraph_inputs_embeds, + ) + # Only rank 0 should print progress bar during capture + if get_tensor_model_parallel_rank() == 0: + compilation_cases = tqdm( + list(compilation_cases), + disable=not self.load_config.use_tqdm_on_load, + desc="Capturing CUDA graph shapes") + for batch_size, use_inputs_embeds in compilation_cases: + attn_metadata = ( + self.attn_state.graph_capture_get_metadata_for_batch( + batch_size, + is_encoder_decoder_model=self.model_config. + is_encoder_decoder)) + # Disable KV Scale Calculation for graph capture + attn_metadata.enable_kv_scales_calculation = False + if self.lora_config: + lora_mapping = LoRAMapping( + **dict(index_mapping=[dummy_lora_id] * batch_size, + prompt_mapping=[dummy_lora_id] * batch_size, + is_prefill=False)) + self.set_active_loras(set([dummy_lora_request]), + lora_mapping) + + graph_runner = CUDAGraphRunner( + self.model, self.attn_backend.get_name(), + self.attn_state.graph_clone(batch_size), + self.model_config.is_encoder_decoder) + + capture_inputs = { + "input_ids": + input_tokens[:batch_size], + "inputs_embeds": + inputs_embeds[:batch_size] + if use_inputs_embeds else None, + "positions": + input_positions[..., :batch_size], + "intermediate_inputs": + intermediate_inputs[:batch_size] + if intermediate_inputs is not None else None, + "kv_caches": + kv_caches[virtual_engine], + "attn_metadata": + attn_metadata, + "memory_pool": + self.graph_memory_pool, + "stream": + graph_capture_context.stream + } + if previous_hidden_states is not None: + capture_inputs[ + "previous_hidden_states"] = previous_hidden_states[: + batch_size] + + if self.has_inner_state: + # Only used by Mamba-based models CUDA graph atm (Jamba) + capture_inputs.update({ + 
"seqlen_agnostic_capture_inputs": + self.model.get_seqlen_agnostic_capture_inputs( + batch_size) + }) + if self.model_config.is_encoder_decoder: + # add the additional inputs to capture for + # encoder-decoder models. + self._update_inputs_to_capture_for_enc_dec_model( + capture_inputs) + + with set_forward_context(attn_metadata, self.vllm_config, + virtual_engine): + graph_runner.capture(**capture_inputs) + self.graph_memory_pool = graph_runner.graph.pool() + self.graph_runners[virtual_engine][( + batch_size, use_inputs_embeds)] = graph_runner + + if self.lora_config: + self._remove_dummy_loras() + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes < 10 seconds. + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / GiB_bytes) + + def _update_inputs_to_capture_for_enc_dec_model(self, + capture_inputs: Dict[str, + Any]): + """ + Updates the set of input tensors needed for CUDA graph capture in an + encoder-decoder model. + + This method modifies the provided `capture_inputs` dictionary by + adding tensors specific to encoder-decoder specific models that + need to be captured for CUDA Graph replay. + """ + # During the decode phase encoder_input_ids and encoder_positions are + # unset. Do the same thing for graph capture. + capture_inputs["encoder_input_ids"] = torch.tensor([], + dtype=torch.long, + device=self.device) + capture_inputs["encoder_positions"] = torch.tensor([], + dtype=torch.long, + device=self.device) + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + +class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): + """ + GPU model runner with sampling step. 
+ """ + _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( + ModelInputForGPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForGPUWithSamplingMetadata: + model_input = \ + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + return model_input + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None, + ) -> ModelInputForGPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + + The result tensors and data structure also batches input in prefill + -> decode order. For example, + + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + + If cuda graph is required, this API automatically pads inputs. 
+ """ + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + if get_pp_group().is_last_rank: + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, model_input.seq_lens, + model_input.query_lens, self.device, self.pin_memory, + generators, self.sampling_metadata_cache) + else: + sampling_metadata = None + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + **kwargs, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError("num_steps > 1 is not supported in ModelRunner") + + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + + self.attn_state.begin_forward(model_input) + + # Currently cuda graph is only supported by the decode phase. + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata + # TODO(andoorve): We can remove this once all + # virtual engines share the same kv cache. 
+ virtual_engine = model_input.virtual_engine + previous_hidden_states = kwargs.get("previous_hidden_states") + if prefill_meta is None and decode_meta.use_cuda_graph: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + use_inputs_embeds = model_input.inputs_embeds is not None + model_executable = self.graph_runners[virtual_engine][( + graph_batch_size, use_inputs_embeds)] + if previous_hidden_states is not None: + previous_hidden_states = torch.cat([ + previous_hidden_states, + torch.empty([ + graph_batch_size - previous_hidden_states.shape[0], + *previous_hidden_states.shape[1:] + ], + dtype=previous_hidden_states.dtype, + device=previous_hidden_states.device) + ]) + else: + model_executable = self.model + + # Receive KV cache in distributed KV cache transfer setting + # In disagg prefill setting, it will also recv hidden states and bypass + # model forwarding + # In KV cache database setting, it will change the model input so that + # we can skip prefilling on tokens that successfully received KV caches + # NOTE: The receive operation is blocking + bypass_model_exec = False + if self.need_recv_kv(model_input, kv_caches): + hidden_or_intermediate_states, bypass_model_exec, model_input = \ + get_kv_transfer_group().recv_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can receive KV for only those + # layers. 
+ model_executable, + model_input, + kv_caches=kv_caches + ) + + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + seqlen_agnostic_kwargs = { + "finished_requests_ids": model_input.finished_requests_ids, + "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, + } if self.has_inner_state else {} + model_kwargs = {} + if previous_hidden_states is not None: + model_kwargs["previous_hidden_states"] = previous_hidden_states + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_start = torch.cuda.Event(enable_timing=True) + model_forward_end = torch.cuda.Event(enable_timing=True) + model_forward_start.record() + + if not bypass_model_exec: + with set_forward_context(model_input.attn_metadata, + self.vllm_config, virtual_engine): + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + inputs_embeds=model_input.inputs_embeds, + positions=model_input.input_positions, + intermediate_tensors=intermediate_tensors, + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + device=self.device, + ), + **seqlen_agnostic_kwargs, + **model_kwargs, + ) + + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_end.record() + + # Sending KV cache in distributed KV cache transfer setting + # NOTE: the send operation is non-blocking + if self.need_send_kv(model_input, kv_caches): + get_kv_transfer_group().send_kv_caches_and_hidden_states( + # model_executable is used to know which layer the current + # worker is working on, so that we can send KV for only those + # layers. + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) + + # Compute the logits in the last pipeline stage. 
+ if not get_pp_group().is_last_rank: + if (self.is_driver_worker + and hidden_or_intermediate_states is not None + and isinstance(hidden_or_intermediate_states, + IntermediateTensors) + and self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + hidden_or_intermediate_states.tensors["model_forward_time"] = ( + torch.tensor(model_forward_time + orig_model_forward_time)) + return hidden_or_intermediate_states + + logits = self.model.compute_logits(hidden_or_intermediate_states, + model_input.sampling_metadata) + + if self.is_driver_worker: + if model_input.async_callback is not None: + model_input.async_callback() + + # Sample the next token. + assert isinstance(self.sampler, Sampler) + orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor + if model_input.inputs_embeds is not None: + self.sampler.include_gpu_probs_tensor = True + + output: SamplerOutput = self.sampler( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time + and output is not None): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + # If there are multiple workers, we are still tracking the + # latency from the start time of the driver worker to the end + # time of the driver worker. The model forward time will then + # end up covering the communication time as well. 
+ output.model_forward_time = (orig_model_forward_time + + model_forward_time) + + if model_input.inputs_embeds is not None: + if self.is_driver_worker: + sampled_token_ids = [] + valid_outputs = [] + for sequence_group_output in output.outputs: + if len(sequence_group_output.samples) == 0: + continue + assert len(sequence_group_output.samples) == 1 + valid_outputs.append(sequence_group_output) + sampled_token_ids.append( + sequence_group_output.samples[0].output_token) + sampled_token_ids = torch.tensor(sampled_token_ids).to( + self.device) + sampled_token_ids = broadcast_tensor_dict( + {"sampled_token_ids": + sampled_token_ids})["sampled_token_ids"] + else: + sampled_token_ids = broadcast_tensor_dict( + )["sampled_token_ids"] + if len(sampled_token_ids) > 0: + sampled_token_embeds = \ + self.model.get_input_embeddings(sampled_token_ids) + if self.is_driver_worker: + self.sampler.include_gpu_probs_tensor = \ + orig_include_gpu_probs + for i, sequence_group_output in enumerate(valid_outputs): + sequence_group_output.samples[0].output_embed = \ + sampled_token_embeds[i] + + if not self.is_driver_worker: + return [] + + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert model_input.sampling_metadata is not None + indices = model_input.sampling_metadata.selected_token_indices + if model_input.is_prompt: + hidden_states = hidden_or_intermediate_states.index_select( + 0, indices) + output.prefill_hidden_states = hidden_or_intermediate_states + elif decode_meta.use_cuda_graph: + hidden_states = hidden_or_intermediate_states[:len(indices)] + else: + hidden_states = hidden_or_intermediate_states + + output.hidden_states = hidden_states + + return [output] + + def need_recv_kv(self, model_input, kv_caches) -> bool: + """Check if we need to receive kv-cache from the other worker. + We need to receive KV when + 1. current vLLM instance is KV cache consumer/decode vLLM instance + 2. this batch is not a profiling run + 3. 
this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + if self.vllm_config.kv_transfer_config is None: + return False + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + return self.vllm_config.kv_transfer_config.is_kv_consumer and ( + not is_profile_run) and is_prefill_run + + def need_send_kv(self, model_input, kv_caches) -> bool: + """Check if we need to send kv-cache to the other worker. + We need to send KV when + 1. current vLLM instance is KV cache producer/prefill vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + if self.vllm_config.kv_transfer_config is None: + return False + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + return self.vllm_config.kv_transfer_config.is_kv_producer and ( + not is_profile_run) and is_prefill_run + + +# NOTE: this is nn.Module so the profiler can properly capture/group +# kernels calls made within the graph +class CUDAGraphRunner(nn.Module): + + def __init__(self, model: nn.Module, backend_name: str, + attn_state: AttentionState, is_encoder_decoder_model: bool): + super().__init__() + self.model = model + self.backend_name = backend_name + self.attn_state = attn_state + + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} + + self._graph: Optional[torch.cuda.CUDAGraph] = None + self._is_encoder_decoder_model = is_encoder_decoder_model + + @property + def graph(self): + assert self._graph is not None + 
def capture(
    self,
    input_ids: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_inputs: Optional[IntermediateTensors],
    kv_caches: List[torch.Tensor],
    attn_metadata: AttentionMetadata,
    memory_pool: Optional[Tuple[int, int]],
    stream: torch.cuda.Stream,
    **kwargs,
):
    """Warm the model up, record one forward pass as a CUDA graph, and
    remember the tensors that act as the graph's inputs and outputs.

    Args:
        input_ids: passed to the model and stored as an input buffer.
        inputs_embeds: always passed during warm-up; passed to the
            captured call and stored as a buffer only when not None.
        positions: passed to the model and stored as an input buffer.
        intermediate_inputs: passed to the model as
            ``intermediate_tensors``; when not None its ``tensors`` are
            merged into the input buffers.
        kv_caches: stored under the ``"kv_caches"`` input buffer key.
        attn_metadata: handed to
            ``self.attn_state.get_graph_input_buffers``.
        memory_pool: forwarded as ``pool`` to ``torch.cuda.graph``.
        stream: forwarded as ``stream`` to ``torch.cuda.graph``.
        **kwargs: forwarded to every model call and stored as input
            buffers.

    Side effects:
        Sets ``self._graph``, ``self.input_buffers`` and
        ``self.output_buffers``. Must be called at most once per
        instance (asserted).
    """
    assert self._graph is None

    # Only the captured call (and the saved buffers) omit
    # `inputs_embeds` when it is None; warm-up always passes it.
    embeds_kwargs = ({} if inputs_embeds is None else {
        "inputs_embeds": inputs_embeds,
    })

    # Warm-up: execute the model outside of capture so that one-off
    # kernel launches (e.g. Triton autotune benchmarking) do not end up
    # in the graph. A single iteration is not enough for torch.compile.
    for _ in range(_NUM_WARMUP_ITERS):
        self.model(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            positions=positions,
            intermediate_tensors=intermediate_inputs,
            **kwargs,
        )
    # Let the warm-up work drain before graph capture starts.
    torch.cuda.synchronize()

    # Record the graph.
    self._graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream):
        raw_output = self.model(
            input_ids=input_ids,
            **embeds_kwargs,
            positions=positions,
            intermediate_tensors=intermediate_inputs,
            **kwargs,
        )

        # Keep only weak references to the model output.
        if isinstance(raw_output, torch.Tensor):
            hidden_or_intermediate_states = weak_ref_tensor(raw_output)
        elif isinstance(raw_output, IntermediateTensors):
            # A comprehension is used on purpose: its loop variables do
            # not leak into this scope, so no strong reference to an
            # output tensor survives the `del` below.
            hidden_or_intermediate_states = IntermediateTensors(
                tensors={
                    name: weak_ref_tensor(tensor)
                    for name, tensor in raw_output.tensors.items()
                })

        # Drop the strong reference and collect so the output is
        # released within the graph's memory pool.
        del raw_output
        gc.collect()
    torch.cuda.synchronize()

    # Assemble the input buffers in the same key order as before:
    # later sources override earlier ones on key collisions.
    buffers = {"input_ids": input_ids}
    buffers.update(embeds_kwargs)
    buffers["positions"] = positions
    buffers["kv_caches"] = kv_caches
    buffers.update(
        self.attn_state.get_graph_input_buffers(
            attn_metadata, self._is_encoder_decoder_model))
    buffers.update(kwargs)
    self.input_buffers = buffers
    if intermediate_inputs is not None:
        self.input_buffers.update(intermediate_inputs.tensors)

    # The last pipeline rank exposes hidden states under a fixed key;
    # other ranks expose the intermediate tensors object directly.
    self.output_buffers = ({
        "hidden_states": hidden_or_intermediate_states
    } if get_pp_group().is_last_rank else hidden_or_intermediate_states)
class KunlunWorker(Worker):
    """Worker whose ``init_device`` prepares a ``cuda``-typed device and
    builds a ``KunlunModelRunner`` for it."""

    # Fraction of the device's total memory recorded as
    # ``requested_memory``. The original inline comments asked for 80% of
    # total memory (leaving 20% headroom) while the code multiplied by
    # 0.2; the constant now matches the documented intent.
    # NOTE(review): consider deriving this from the configured GPU memory
    # utilization instead of a fixed fraction - confirm with the owners.
    _REQUESTED_MEMORY_FRACTION = 0.8

    def init_device(self):
        """Select the device, snapshot its memory, and create the runner.

        Steps, in order:
            1. Reject any device type other than ``"cuda"``.
            2. Adjust NCCL-related environment variables.
            3. Bind this process to ``cuda:{local_rank}`` and check the
               model dtype against the device.
            4. Clear caches / peak stats and record the initial memory
               snapshot, free memory and requested memory.
            5. Initialize the distributed environment, seed the RNGs and
               construct ``self.model_runner``.

        Raises:
            RuntimeError: if ``self.device_config.device.type`` is not
                ``"cuda"``.
        """
        if self.device_config.device.type != "cuda":
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")

        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # This env var set by Ray causes exceptions with graph building.
        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
        self.device = torch.device(f"cuda:{self.local_rank}")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Start from a clean allocator state so the snapshot below is not
        # skewed by earlier allocations.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        self.init_snapshot = MemorySnapshot()
        free_memory, total = torch.cuda.mem_get_info()
        self.init_gpu_memory = free_memory
        # Request 80% of total device memory, keeping 20% as headroom.
        self.requested_memory = int(total * self._REQUESTED_MEMORY_FRACTION)

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config,
                                            self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)
        # Construct the model runner
        self.model_runner: KunlunModelRunner = KunlunModelRunner(
            self.vllm_config, self.device)