init src 0.9.2
Changed files:

README.md | 85
@@ -2,87 +2,6 @@

A text-generation engine for Hygon DCU accelerator cards, built on the vLLM engine with architecture-specific adaptations and optimizations. It supports the latest open-source models such as Qwen, DeepSeek, and Llama.

Because the startup procedure and the exact image differ slightly between models, see the launch and test instructions for each supported model under the `/enginex` directory.

Source image: harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.2-1226-das1.7-py3.10-20251226

## Supported models

See the project folder `/enginex` for how to run each supported model.

Supported model list:

- jinaai/jina-embeddings-v3
- deepseek-ai/DeepSeek-R1
- Qwen/QwQ-32B
- deepseek-ai/DeepSeek-V3
- deepseek-ai/DeepSeek-V3.1
- LLaMA_Fastchat_pytorch
- Qwen/Qwen3-30B-A3B
- Qwen-7B_fastllm
- ChatGLM-6B_fastllm
- ZhipuAI/ChatGLM-6B
- Shanghai_AI_Laboratory/internlm-chat-7b
- ZhipuAI/glm-4v-9b
- ZhipuAI/GLM-4-9B-0414
- deepseek-ai/DeepSeek-Coder-V2-Base
- openai-community/gpt2
- ZhipuAI/chatglm2-6b
- Qwen/Qwen-7B-Chat
- baichuan-inc/Baichuan2-13B-Chat
- ZhipuAI/chatglm3-6b
- deepseek-ai/DeepSeek-V2
- Qwen/Qwen2.5-Omni-7B
- deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
- deepseek-ai/DeepSeek-R1-Distill-Llama-8B
- deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
- deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
- deepseek-ai/DeepSeek-R1-Distill-Llama-70B
- LLM-Research/Meta-Llama-3-8B-Instruct
- Qwen/Qwen1.5-14B-Chat
- Qwen/Qwen2-7B
- Qwen/Qwen3-Embedding-0.6B
- baichuan-inc/baichuan-7B
- gaodema/GME-Qwen2-VL
- OpenBMB/MiniCPM3-4B
- ZhipuAI/glm-10b-chinese
- 01ai/Yi-6B-Chat
- 01ai/Yi-34B-Chat
- ZhipuAI/glm-4-9b-chat
- deepseek-ai/DeepSeek-OCR
- Qwen/Qwen2.5-Coder-0.5B-Instruct
- Qwen/Qwen2.5-Coder-1.5B-Instruct
- Qwen/Qwen2.5-Coder-3B-Instruct
- Qwen/Qwen2.5-Coder-7B-Instruct
- Qwen/Qwen2.5-Coder-14B-Instruct
- Qwen/Qwen2.5-Coder-0.5B
- Qwen/Qwen2.5-Coder-1.5B
- Qwen/Qwen2.5-Coder-3B
- Qwen/Qwen2.5-Coder-7B
- Qwen/Qwen2.5-Coder-14B
- Qwen/Qwen2.5-Coder-32B
- deepseek-ai/DeepSeek-V3.2-Exp
- ZhipuAI/GLM-4.1V-9B-Thinking
- ZhipuAI/GLM-4.1V-9B-Base
- Shanghai_AI_Laboratory/internlm2_5-7b
- Shanghai_AI_Laboratory/internlm2-chat-20b
- Shanghai_AI_Laboratory/internlm2-7b
- Shanghai_AI_Laboratory/internlm2_5-20b
- TeleAI/telechat-7B
- TeleAI/TeleChat-12B-v2
- OpenBMB/MiniCPM-2B-dpo-bf16
- LLM-Research/Phi-4-multimodal-instruct
- LLM-Research/Mistral-7B-Instruct-v0.3
- Shanghai_AI_Laboratory/internlm2_5-7b-chat
- shakechen/Llama-2-7b-hf
- Qwen/Qwen2-Audio-7B-Instruct
- AI-ModelScope/gemma-2-2b
- AI-ModelScope/falcon-7b-instruct
- Duxiaoman-DI/XuanYuan-13B-Chat
- ZhipuAI/GLM-4.6
- LLM-Research/Codestral-22B-v0.1
- facebook/llm-compiler-7b
- 01ai/Yi-1.5-6B-Chat
- FreedomIntelligence/HuatuoGPT-o1-8B
- ZhipuAI/GLM-Z1-32B-0414
- Salesforce/Llama-xLAM-2-8b-fc-r
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-Coder-480B-A35B-Instruct

Version: 0.9.2
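As a quick orientation, the sketch below shows one way to load any of the listed models with vLLM's offline Python API. It is a minimal illustration, not part of the per-model instructions: the model path is a placeholder, and the DCU images may require the image-specific launch flags documented under `/enginex`.

```python
# Minimal offline-inference sketch using the vLLM Python API.
# The model path is a placeholder; replace it with a locally downloaded model
# from the list above.
from vllm import LLM, SamplingParams

llm = LLM(model="/path/to/Qwen2-7B")
params = SamplingParams(temperature=0.7, max_tokens=128)

outputs = llm.generate(["Briefly introduce the Hygon DCU."], params)
for out in outputs:
    print(out.outputs[0].text)
```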
@@ -1,10 +0,0 @@
# How to run

```bash
# Docker is the recommended way to run; a pullable Docker image is provided:
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
docker run -dit --shm-size 80g --network=host --name=baichuan2 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /opt/hyhal/:/opt/hyhal/:ro image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10 /bin/bash
docker exec -it baichuan2 /bin/bash
# Install the dependencies that the image does not include:
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -dit --network=host --name=chatglm --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 -v /opt/hyhal/:/opt/hyhal/:ro git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310 /usr/sbin/init
docker exec -it chatglm /bin/bash
pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install accelerate sentencepiece mdtex2html gradio rouge_chinese nltk jieba datasets protobuf peft pydantic==1.10.9 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```

@@ -1,9 +0,0 @@
# How to run

Docker is the recommended way to run; a pullable Docker image is provided:
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/custom:glm-ft-v1.0
# Fill in a custom container name after --name=
# Fill in the path of the current project before :/work
docker run -it --name= -v :/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 git.modelhub.org.cn:9443/enginex-hygon/custom:glm-ft-v1.0 /bin/bash
```
@@ -1,12 +0,0 @@
# How to run

Docker is the recommended way to run; a pullable Docker image is provided:
```bash
# Docker is the recommended way to run; a pullable Docker image is provided:
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
# Enter the container and install the dependencies that the image does not include:
docker run -dit --network=host --name=chatglm --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 -v /opt/hyhal/:/opt/hyhal/:ro image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310 /usr/sbin/init
docker exec -it chatglm /bin/bash
pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install accelerate sentencepiece mdtex2html gradio rouge_chinese nltk jieba datasets protobuf peft pydantic==1.10.9 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```

@@ -1,14 +0,0 @@
# How to run

Docker is the recommended way to run; a pullable Docker image is provided:
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
```
Enter the container and install the dependencies that the image does not include:
```bash
docker run -dit --network=host --name=chatglm3 --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G -v /opt/hyhal/:/opt/hyhal/:ro --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
docker exec -it chatglm3 /bin/bash
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
cd finetune_demo
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
@@ -1,11 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/codestral_pytorch
pip install -r requirements.txt
pip install -U huggingface_hub hf_transfer
export HF_ENDPOINT=https://hf-mirror.com
```

@@ -1,11 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/deepseek-coder-v2_pytorch
pip install -r requirements.txt
pip install -U huggingface_hub hf_transfer
export HF_ENDPOINT=https://hf-mirror.com
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/deepseek-ocr_pytorch
```
@@ -1,10 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10

# Replace <absolute path to the project> and <your IMAGE ID> accordingly:
docker run --shm-size 500g --network=host --name=dpskv3 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash

pip install https://download.sourcefind.cn:65024/directlink/4/lmslim/DAS1.3/lmslim-0.1.2+das.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl

pip install https://download.sourcefind.cn:65024/directlink/4/vllm/DAS1.3/vllm-0.6.2+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl
```

@@ -1,9 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10

# Replace <absolute path to the project> and <your IMAGE ID> accordingly:
docker run --shm-size 500g --network=host --name=dpskr1 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash

cd inference
pip install -r requirements.txt
```

@@ -1,9 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04

# Replace <absolute path to the project> and <your IMAGE ID> accordingly:
docker run --shm-size 500g --network=host --name=dpskr1 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash

cd inference
pip install -r requirements.txt
```
@@ -1,8 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/deepseek-v2_pytorch
pip install -r requirements.txt
```

@@ -1,8 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas101839-0811-das1.6-py3.10-20250812-beta

docker run -it --name {docker_name} --device=/dev/kfd --privileged --network=host --device=/dev/dri --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /your_code_path:/your_code_path -v /opt/hyhal:/opt/hyhal:ro -v /module/DeepSeek-V3.1:/your_model_path/DeepSeek-V3.1 --group-add video --shm-size 64G {imageID} bash

cd /your_code_path/deepseek-v3.1_vllm
```

@@ -1,8 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/deepseek-v3.2-exp_vllm
```
@@ -1,9 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10

# Replace <absolute path to the project> and <your IMAGE ID> accordingly:
docker run --shm-size 500g --network=host --name=dpskv3 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash

cd inference
pip install -r requirements.txt
```

@@ -1,12 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.2-py3.10
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=64G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name glm-4v bash

cd /path/your_code_data/

pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# Download bitsandbytes from the developer community:
pip install bitsandbytes-0.42.0+das1.1.gitce85679.abi1.dtk2404.torch2.1.0-py3-none-any.whl
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250711
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/glm-4.1v_pytorch
pip install transformers==4.53.2
```

@@ -1,8 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/glm-4.6_vllm
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/glm-z1_pytorch
pip install "transformers>=4.51.3"
```
@@ -1,8 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10
docker run --shm-size 100g --network=host --name=gme --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it bash

pip install -r requirements.txt
```

@@ -1,15 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10

docker run --shm-size 50g --network=host --name=huatuo --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it bash

pip install -r requirements.txt

pip uninstall vllm

pip install https://download.sourcefind.cn:65024/directlink/4/lmslim/DAS1.3/lmslim-0.1.2+das.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl

pip install https://download.sourcefind.cn:65024/directlink/4/vllm/DAS1.3/vllm-0.6.2+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl
```

@@ -1,10 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10
# Replace <img id=""> with the ID of the Docker image pulled above.
# The -v flag maps a host-side path to a path inside the container.
# To map ports between host and container, remove the --network host flag.
docker run -it --name internlm_vllm --privileged --shm-size=64G --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal -v : <img id=""> /bin/bash
```
@@ -1,16 +0,0 @@
# How to run
```bash
# Pull the image:
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Create and start the container:
docker run --shm-size 64g --network=host --name=llama_fastchat --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /opt/hyhal:/opt/hyhal:ro -v : -it bash

cp -r mpirun/* ./
cd FastChat-main
pip3 install -e .
cd ../transformers-main
pip3 install -e .
pip3 uninstall wandb
pip3 install mpi4py
cd ..
```

@@ -1,7 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.2-py3.10

docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal:ro --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
```
@@ -1,11 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10

docker run --shm-size 100g --network=host --name=wan --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to the project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it bash

pip install -r requirements.txt

pip install -e .
```

@@ -1,10 +0,0 @@
# How to run

```bash
# Docker is the recommended way to run; a pullable Docker image is provided:
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/llama3_pytorch
pip install -e .
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Insert the ID of the Docker image pulled above before "bash":
docker run -it --shm-size=32G -v $PWD/MiniCPM:/home/MiniCPM -v /opt/hyhal:/opt/hyhal --network=host --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name minicpm bash
cd /home/MiniCPM
pip install -r finetune/requirements.txt
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Insert the ID of the Docker image pulled above before "bash":
docker run -it --shm-size=32G -v $PWD/MiniCPM:/home/MiniCPM -v /opt/hyhal:/opt/hyhal --network=host --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name minicpm bash
cd /home/MiniCPM
pip install -r finetune/requirements.txt
```
@@ -1,8 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/mistral_pytorch
```

@@ -1,12 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
# Insert the ID of the Docker image pulled above before "bash":
docker run -it --shm-size=1024G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name phi-4 bash

git clone http://developer.sourcefind.cn/codes/modelzoo/phi-4-multimodal-instruct_pytorch.git

cd /path/your_code_data/

pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10
# Replace <your IMAGE ID> with the ID of the image pulled above; for this image it is dee41741fb40.
docker run -it --shm-size=64G --network host -v $PWD/QwQ-32B:/home/QwQ-32B -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qwq <your IMAGE ID> bash
cd /home/QwQ-32B
pip install -r requirements.txt
```

@@ -1,9 +0,0 @@
# How to run

```bash
# Docker is the recommended way to run; a pullable Docker image is provided:
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
# Fill in a custom container name after --name=
# Fill in the path of the current project before :/work
docker run -it --name= -v :/work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
```
@@ -1,8 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
# Fill in a custom container name after --name=
# Fill in the path of the current project before :/work
docker run -it --name= -v :/work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 git.modelhub.org.cn:9443/enginex-hygon/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
```

@@ -1,9 +0,0 @@
# How to run

```bash
# Docker is the recommended way to run; a pullable Docker image is provided:
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Insert the ID of the image pulled above before "bash"; for this image it is ffa1f63239fc.
docker run -it --shm-size=1024G -v $PWD/qwen1.5-pytorch:/home/Qwen1.5-pytorch -v /opt/hyhal:/opt/hyhal --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name Qwen1.5-pytorch bash
cd /home/Qwen1.5-pytorch
pip install -r requirement.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
@@ -1,11 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Insert the ID of the image pulled above before "bash"; for this image it is a4dd5be0ca23.
docker run -it --shm-size=1024G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qwen2_72B_pytorch bash
pip install https://cancon.hpccube.com:65024/directlink/4/vllm/DAS1.1.1/vllm-0.5.0+das.opt1.3e2c63a.dtk2404.torch2.1.0-cp310-cp310-linux_x86_64.whl
cd /path/your_code_data/
cd LLaMA-Factory
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install -e . -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```

@@ -1,15 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10

docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=64G --privileged=true --network=host --device=/dev/kfd --device=/dev/dri/ --group-add video --name qwen2-audio bash

cd /path/your_code_data/Qwen2-Audio/demo

pip install -r requirements_web_demo.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

pip install git+https://github.com/modelscope/swift.git#egg=ms-swift[llm]

pip install git+https://github.com/huggingface/transformers.git
```
@@ -1,13 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
# Insert the ID of the image pulled above before "bash"; for this image it is e77c15729879.
docker run -it --shm-size=64G -v $PWD/Qwen2.5-Omni:/home/Qwen2.5-Omni -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qomni bash
cd /home/Qwen2.5-Omni
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple

unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e .  # The upstream authors require transformers==4.50.0.dev0
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724

docker run -it --name {docker_name} --device=/dev/kfd --privileged --network=host --device=/dev/dri --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /public/LLM-Models:/home/LLM-Models:ro -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal:ro --group-add video --shm-size 64G {imageID} bash

cd /your_code_path/qwen3-30b-a3b_vllm
```
@@ -1,10 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/qwen3-embedding_pytorch
pip install "transformers>=4.51.0"
pip install "sentence-transformers>=2.7.0"
```

@@ -1,10 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/custom:vllm0.8.4-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250429-dev-qwen3-only
# docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
# Insert the ID of the image pulled above before "bash"; for this image it is 6e12a1c4ae4d.
docker run -it --shm-size=64G -v $PWD/Qwen3:/home/Qwen3 -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qwen3 bash
cd /home/Qwen3
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/custom:vllm0.8.4-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250429-dev-qwen3-only
# Replace <your IMAGE ID> with the ID of the Docker image pulled above.
docker run -it --shm-size=64G -v $PWD/Qwen3:/home/Qwen3 -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qwen3 <your IMAGE ID> bash
cd /home/Qwen3
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
```

@@ -1,15 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10

# Create and start the container:
docker run --shm-size 80g --network=host --name=telechat --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /opt/hyhal:/opt/hyhal:ro -v : -it bash

# Install dependencies:
cd TeleChat
pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple/
pip install 'ms-swift[llm]' -U -i https://pypi.mirrors.ustc.edu.cn/simple/
pip install optimum -i https://pypi.mirrors.ustc.edu.cn/simple/
pip install auto-gptq -i https://pypi.mirrors.ustc.edu.cn/simple/
```
@@ -1,17 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10

docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=64G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name xuanyuan bash

cd /path/your_code_data/

pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git

cd LLaMA-Factory

pip install -e ".[torch,metrics]"
```

@@ -1,10 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-py3.10-dtk24.04.3-ubuntu20.04
# Insert the ID of the image pulled above before "bash" and fill in the -v host:container mount:
docker run -it --shm-size=1024G -v : -v /opt/hyhal:/opt/hyhal --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name Yi-1.5 bash
cd /home/Yi-1.5-pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

pip uninstall vllm
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Replace <img id=""> with the ID of the Docker image pulled above.
# The -v flag maps a host-side path to a path inside the container.
docker run -it --name yi --shm-size=64G --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /opt/hyhal:/opt/hyhal:ro --ulimit memlock=-1:-1 --ipc=host --network=host --group-add video -v : <img id=""> /bin/bash
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Replace <img id=""> with the ID of the Docker image pulled above.
# The -v flag maps a host-side path to a path inside the container.
docker run -it --name yi --shm-size=64G --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /opt/hyhal:/opt/hyhal:ro --ulimit memlock=-1:-1 --ipc=host --network=host --group-add video -v : <img id=""> /bin/bash
```
@@ -1,7 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
docker run -dit --network=host --name=baichuan -v /opt/hyhal:/opt/hyhal:ro --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
docker exec -it baichuan /bin/bash
```

@@ -1,8 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
docker exec -it chatglm /bin/bash
pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install accelerate sentencepiece mdtex2html gradio rouge_chinese nltk jieba datasets==2.20.0 protobuf peft==0.5.0 pydantic==1.10.9 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.2-py3.10
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/falcon_pytorch
pip install -r requirements.txt
```

@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.2-py3.10
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/gemma2_pytorch
pip install -r requirements.txt
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# Insert the ID of the Docker image pulled above before "bash":
docker run -it --shm-size=32G -v $PWD/MiniCPM:/home/MiniCPM -v /opt/hyhal:/opt/hyhal --network=host --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name minicpm bash
cd /home/MiniCPM
pip install -r finetune/requirements.txt
```

@@ -1,10 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/glm-4_pytorch
pip install -r inference/requirements.txt
pip install -r finetune/requirements.txt
```
@@ -1,13 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10

docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=64G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name glm-4v bash

cd /path/your_code_data/

pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# Download bitsandbytes from the developer community:
pip install bitsandbytes-0.42.0+das1.1.gitce85679.abi1.dtk2404.torch2.1.0-py3-none-any.whl
```

@@ -1,8 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:1.10.0-centos7.6-dtk-23.04-py37-latest
docker run -dit --network=host --name=gpt2_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py37-latest
docker exec -it gpt2_pytorch /bin/bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
@@ -1,9 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10  # recommended
# Replace <img id=""> with the ID of the Docker image pulled above.
# The -v flag maps a host-side path to a path inside the container.
docker run -it --name baichuan --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /opt/hyhal:/opt/hyhal:ro --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v : <img id=""> /bin/bash
```

@@ -1,8 +0,0 @@
# How to run
```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250612-fixpy-rocblas0611-rc2

docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash

cd /your_code_path/jina-embeddings-v3_vllm
```
@@ -1,11 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash

cd /your_code_path/llm-compiler_pytorch
pip install -r requirements.txt
pip install -U huggingface_hub hf_transfer
export HF_ENDPOINT=https://hf-mirror.com
```

@@ -1,13 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10
docker run -it --name {name} --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v {}:{} {docker_image} /bin/bash
# Change 1: replace {name} with a custom container name.
# Change 2: replace {docker_image} with the image name used to create the container.
# Change 3: the -v flag mounts a host path to the specified path in the container.
pip install -r requirements.txt
cd LLaMA-Factory
pip install -e ".[torch,metrics]"
pip install deepspeed-0.14.2+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
@@ -1,15 +0,0 @@
# How to run

```bash
docker pull git.modelhub.org.cn:9443/enginex-hygon/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10

# Create and start the container:
docker run --shm-size 80g --network=host --name=telechat --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /opt/hyhal:/opt/hyhal:ro -v : -it bash

# Install dependencies:
cd TeleChat
pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple/
pip install 'ms-swift[llm]' -U -i https://pypi.mirrors.ustc.edu.cn/simple/
pip install optimum -i https://pypi.mirrors.ustc.edu.cn/simple/
pip install auto-gptq -i https://pypi.mirrors.ustc.edu.cn/simple/
```
BIN vllm/_C.abi3.so (Executable file; binary file not shown)

96 vllm/__init__.py (Normal file)
@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""

# The version.py should be independent library, and we always import the
# version library first. Such assumption is critical for some customization.
from .version import __version__, __version_tuple__  # isort:skip

import typing

# The environment variables override should be imported before any other
# modules to ensure that the environment variables are set before any
# other modules are imported.
import vllm.env_override  # noqa: F401

MODULE_ATTRS = {
    "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs",
    "EngineArgs": ".engine.arg_utils:EngineArgs",
    "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
    "LLMEngine": ".engine.llm_engine:LLMEngine",
    "LLM": ".entrypoints.llm:LLM",
    "initialize_ray_cluster": ".executor.ray_utils:initialize_ray_cluster",
    "PromptType": ".inputs:PromptType",
    "TextPrompt": ".inputs:TextPrompt",
    "TokensPrompt": ".inputs:TokensPrompt",
    "ModelRegistry": ".model_executor.models:ModelRegistry",
    "SamplingParams": ".sampling_params:SamplingParams",
    "PoolingParams": ".pooling_params:PoolingParams",
    "ClassificationOutput": ".outputs:ClassificationOutput",
    "ClassificationRequestOutput": ".outputs:ClassificationRequestOutput",
    "CompletionOutput": ".outputs:CompletionOutput",
    "EmbeddingOutput": ".outputs:EmbeddingOutput",
    "EmbeddingRequestOutput": ".outputs:EmbeddingRequestOutput",
    "PoolingOutput": ".outputs:PoolingOutput",
    "PoolingRequestOutput": ".outputs:PoolingRequestOutput",
    "RequestOutput": ".outputs:RequestOutput",
    "ScoringOutput": ".outputs:ScoringOutput",
    "ScoringRequestOutput": ".outputs:ScoringRequestOutput",
}

if typing.TYPE_CHECKING:
    from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.engine.llm_engine import LLMEngine
    from vllm.entrypoints.llm import LLM
    from vllm.executor.ray_utils import initialize_ray_cluster
    from vllm.inputs import PromptType, TextPrompt, TokensPrompt
    from vllm.model_executor.models import ModelRegistry
    from vllm.outputs import (ClassificationOutput,
                              ClassificationRequestOutput, CompletionOutput,
                              EmbeddingOutput, EmbeddingRequestOutput,
                              PoolingOutput, PoolingRequestOutput,
                              RequestOutput, ScoringOutput,
                              ScoringRequestOutput)
    from vllm.pooling_params import PoolingParams
    from vllm.sampling_params import SamplingParams
else:

    def __getattr__(name: str) -> typing.Any:
        from importlib import import_module

        if name in MODULE_ATTRS:
            module_name, attr_name = MODULE_ATTRS[name].split(":")
            module = import_module(module_name, __package__)
            return getattr(module, attr_name)
        else:
            raise AttributeError(
                f'module {__package__} has no attribute {name}')


__all__ = [
    "__version__",
    "__version_tuple__",
    "LLM",
    "ModelRegistry",
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
]
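The module above exposes the public API lazily: `MODULE_ATTRS` maps attribute names to submodule paths, and `__getattr__` imports the submodule only when the attribute is first accessed. A small sketch of what that means for callers (standard vLLM usage, not specific to this commit):

```python
# Accessing vllm.LLM triggers __getattr__, which imports
# vllm.entrypoints.llm on demand and returns the LLM class.
import vllm

print(vllm.__version__)  # the version module is always imported eagerly
llm_cls = vllm.LLM       # lazy: resolved through MODULE_ATTRS on first access
params = vllm.SamplingParams(temperature=0.0, max_tokens=16)
```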
2455 vllm/_custom_ops.py (Normal file; file diff suppressed because it is too large)

350 vllm/_ipex_ops.py (Normal file)
@@ -0,0 +1,350 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

from vllm.logger import init_logger

logger = init_logger(__name__)

try:
    import intel_extension_for_pytorch as ipex
except ImportError as e:
    logger.warning("Import error msg: %s", e.msg)


class ipex_ops:

    @staticmethod
    def _reshape_activation_tensor(
            x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        num = x.size(0)
        d = x.size(1) // 2
        x = x.reshape(num, 2, d)
        x1, x2 = torch.chunk(x, chunks=2, dim=1)
        x1 = x1.reshape(num, d)
        x2 = x2.reshape(num, d)
        return x1, x2

    @staticmethod
    def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.silu_and_mul(x, out)

    @staticmethod
    def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.gelu_and_mul(x, out)

    @staticmethod
    def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.gelu_and_mul(x, out)

    @staticmethod
    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.gelu(x)

    @staticmethod
    def gelu_new(x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.gelu(x)

    @staticmethod
    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.gelu_quick(x, out)

    @staticmethod
    def paged_attention_v1(
        out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: Optional[torch.Tensor],
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        assert kv_cache_dtype == "auto"
        num_heads = out.size(1)
        num_queries_per_tokens = num_heads // num_kv_heads
        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
            query.contiguous(),
            key_cache.view_as(value_cache),
            value_cache,
            num_queries_per_tokens,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )

    @staticmethod
    def paged_attention_v2(
        out: torch.Tensor,
        exp_sum: torch.Tensor,
        max_logits: torch.Tensor,
        tmp_out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: Optional[torch.Tensor],
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        assert kv_cache_dtype == "auto"
        num_heads = out.size(1)
        num_queries_per_tokens = num_heads // num_kv_heads
        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
            query.contiguous(),
            key_cache.view_as(value_cache),
            value_cache,
            num_queries_per_tokens,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )

    @staticmethod
    def rotary_embedding(
        positions: torch.Tensor,  # [batch_size, seq_len]
        query: torch.Tensor,  # [batch_size, seq_len, num_heads*head_size]
        key: torch.Tensor,  # [batch_size, seq_len, num_kv_heads*head_size]
        head_size: int,
        cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
        is_neox: bool,
    ) -> None:
        rot_dim = cos_sin_cache.size(1)
        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
                                                     head_size, cos_sin_cache,
                                                     is_neox, rot_dim)

    @staticmethod
    def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
                                 key: torch.Tensor, head_size: int,
                                 cos_sin_cache: torch.Tensor, is_neox: bool,
                                 rot_dim: int,
                                 cos_sin_cache_offsets: torch.Tensor) -> None:
        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
                                                     head_size, cos_sin_cache,
                                                     is_neox, rot_dim,
                                                     cos_sin_cache_offsets)

    @staticmethod
    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
                 epsilon: float) -> torch.Tensor:
        return ipex.llm.functional.rms_norm(input, weight, epsilon)

    @staticmethod
    def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
                           weight: torch.Tensor, epsilon: float) -> None:
        tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None,
                                               epsilon, True)
        input.copy_(tmp)

    @staticmethod
    def varlen_attention(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        out: torch.Tensor,
        seqlen_q: torch.Tensor,
        seqlen_k: torch.Tensor,
        alibi_slopes: Optional[torch.Tensor],
        max_seqlen_q: int,
        max_seqlen_k: int,
        pdropout: float,
        softmax_scale: float,
        zero_tensors: bool,
        is_causal: bool,
        return_softmax: bool,
        gen_: torch.Generator,
        window_size_left: float,
        window_size_right: float,
        logits_soft_cap: float,
    ) -> None:
        if ipex.__version__.endswith("cpu"):
            if logits_soft_cap != 0.0:
                raise ValueError("IPEX CPU does not support logits_soft_cap")
            assert alibi_slopes is None
            assert window_size_left < 0 and window_size_right < 0
            ipex.llm.functional.varlen_attention(query.contiguous(),
                                                 key.contiguous(),
                                                 value.contiguous(), out,
                                                 seqlen_q.int(),
                                                 seqlen_k.int(), max_seqlen_q,
                                                 max_seqlen_k, pdropout,
                                                 softmax_scale, zero_tensors,
                                                 is_causal, return_softmax,
                                                 gen_)
        else:  # XPU build
            ipex.llm.functional.varlen_attention(
                query.contiguous(), key.contiguous(), value.contiguous(), out,
                seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q,
                max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal,
                return_softmax, gen_, window_size_left, window_size_right,
                logits_soft_cap)

    @staticmethod
    def reshape_and_cache(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
    ) -> None:
        assert kv_cache_dtype == "auto"
        ipex.llm.modules.PagedAttention.reshape_and_cache(
            key, value, key_cache, value_cache, slot_mapping)

    @staticmethod
    def reshape_and_cache_flash(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: Optional[torch.Tensor] = None,
        v_scale: Optional[torch.Tensor] = None,
        k_scale_float: float = 1.0,
        v_scale_float: float = 1.0,
    ) -> None:
        assert kv_cache_dtype == "auto"
        # TODO: support FP8 kv cache.
        ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
            key, value, key_cache, value_cache, slot_mapping)

    @staticmethod
    def flash_attn_varlen_func(
        out: torch.Tensor,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        cu_seqlens_q: torch.Tensor,
        seqused_k: torch.Tensor,  # we don't support this in ipex kernel
        max_seqlen_q: int,
        max_seqlen_k: int,
        softmax_scale: float,
        causal: bool,
        block_table: torch.Tensor,
        alibi_slopes: Optional[torch.Tensor],
        window_size: Optional[list[int]] = None,
        softcap: Optional[float] = 0.0,
        cu_seqlens_k: Optional[torch.Tensor] = None,
        # The following parameters are not used in ipex kernel currently,
        # we keep API compatible to CUDA's.
        scheduler_metadata=None,
        fa_version: int = 2,
        q_descale=None,
        k_descale=None,
        v_descale=None,
        num_splits=0,
    ):
        if cu_seqlens_k is None:
            # cu_seqlens_k is not used in ipex kernel.
            cu_seqlens_k = torch.cumsum(seqused_k, dim=0)
            cu_seqlens_k = torch.cat([
                torch.tensor([0], device=seqused_k.device, dtype=torch.int32),
                cu_seqlens_k
            ]).to(torch.int32)

        real_window_size: tuple[int, int]
        if window_size is None:
            real_window_size = (-1, -1)
        else:
            assert len(window_size) == 2
            real_window_size = (window_size[0], window_size[1])
        return ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
            out,
            q.contiguous(),
            k,
            v,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            softmax_scale,
            causal,
            block_table,
            alibi_slopes,
            softcap=softcap,
            window_size_left=real_window_size[0],
            window_size_right=real_window_size[1],
            k_scale=1.0,
            v_scale=1.0,
        )

    @staticmethod
    def get_scheduler_metadata(
        batch_size,
        max_seqlen_q,
        max_seqlen_k,
        num_heads_q,
        num_heads_kv,
        headdim,
        cache_seqlens: torch.Tensor,
        qkv_dtype=torch.bfloat16,
        headdim_v=None,
        cu_seqlens_q: Optional[torch.Tensor] = None,
        cu_seqlens_k_new: Optional[torch.Tensor] = None,
        cache_leftpad: Optional[torch.Tensor] = None,
        page_size: Optional[int] = None,
        max_seqlen_k_new=0,
        causal=False,
        window_size=(-1, -1),  # -1 means infinite context window
        has_softcap=False,
        num_splits=0,  # Can be tuned for speed
        pack_gqa=None,  # Can be tuned for speed
        sm_margin=0,  # Can be tuned if some SMs are used for communication
    ) -> None:
        logger.warning_once(
            "get_scheduler_metadata is not implemented for ipex_ops, "
            "returning None.")
        return None

    @staticmethod
    def copy_blocks(key_caches: list[torch.Tensor],
                    value_caches: list[torch.Tensor],
                    block_mapping: torch.Tensor) -> None:
        torch.xpu.copy_blocks(  # type: ignore
            key_caches,
            value_caches,
            block_mapping,
        )

    @staticmethod
    def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                    block_mapping: torch.Tensor) -> None:
        torch.xpu.swap_blocks(src, dst, block_mapping)  # type: ignore
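For readers unfamiliar with the fused activation ops wrapped above, the sketch below illustrates the expected semantics of `silu_and_mul` using plain PyTorch only; it is an illustration of the math, not a call into the IPEX kernel.

```python
# Reference semantics of silu_and_mul: split the last dimension in half,
# apply SiLU to the first half, and multiply elementwise by the second half.
import torch

x = torch.randn(4, 2 * 128)               # [num_tokens, 2 * d]
x1, x2 = x.chunk(2, dim=-1)
out = torch.nn.functional.silu(x1) * x2    # shape [4, 128]
```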
BIN vllm/_moe_C.abi3.so (Executable file; binary file not shown)

0 vllm/adapter_commons/__init__.py (Normal file)

16 vllm/adapter_commons/layers.py (Normal file)
@@ -0,0 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass


@dataclass
class AdapterMapping:
    # Per every token in input_ids:
    index_mapping: tuple[int, ...]
    # Per sampled token:
    prompt_mapping: tuple[int, ...]

    def __post_init__(self):
        self.index_mapping = tuple(self.index_mapping)
        self.prompt_mapping = tuple(self.prompt_mapping)
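A quick sketch of how `AdapterMapping` behaves: `__post_init__` normalizes whatever sequences are passed in into tuples. The values below are made-up examples.

```python
from vllm.adapter_commons.layers import AdapterMapping

# Lists are accepted and converted to tuples by __post_init__.
mapping = AdapterMapping(index_mapping=[1, 1, 0, 2], prompt_mapping=[1, 2])
assert mapping.index_mapping == (1, 1, 0, 2)
assert mapping.prompt_mapping == (1, 2)
```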
106 vllm/adapter_commons/models.py (Normal file)
@@ -0,0 +1,106 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Optional, TypeVar
|
||||
|
||||
from torch import nn
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import LRUCache
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class AdapterModel(ABC):
|
||||
|
||||
def __init__(self, model_id=None):
|
||||
self.id = model_id
|
||||
|
||||
@abstractmethod
|
||||
def from_local_checkpoint(cls, model_dir, model_id=None, **kwargs):
|
||||
# Common initialization code
|
||||
# Load weights or embeddings from local checkpoint
|
||||
raise NotImplementedError("Subclasses must implement this method.")
|
||||
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
class AdapterLRUCache(LRUCache[int, T]):
|
||||
|
||||
def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
|
||||
super().__init__(capacity)
|
||||
self.deactivate_fn = deactivate_fn
|
||||
|
||||
def _on_remove(self, key: int, value: Optional[T]):
|
||||
logger.debug("Removing adapter int id: %d", key)
|
||||
self.deactivate_fn(key)
|
||||
return super()._on_remove(key, value)
|
||||
|
||||
|
||||
class AdapterModelManager(ABC):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: nn.Module,
|
||||
):
|
||||
"""Create a AdapterModelManager and adapter for a given model.
|
||||
Args:
|
||||
model: the model to be adapted.
|
||||
"""
|
||||
self.model: nn.Module = model
|
||||
self._registered_adapters: dict[int, Any] = {}
|
||||
# Dict instead of a Set for compatibility with LRUCache.
|
||||
self._active_adapters: dict[int, None] = {}
|
||||
self.adapter_type = 'Adapter'
|
||||
self._last_mapping = None
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._registered_adapters)
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def adapter_slots(self) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def capacity(self) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def activate_adapter(self, adapter_id: int) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def deactivate_adapter(self, adapter_id: int) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def add_adapter(self, adapter: Any) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def set_adapter_mapping(self, mapping: Any) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def remove_adapter(self, adapter_id: int) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def remove_all_adapters(self) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_adapter(self, adapter_id: int) -> Optional[Any]:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def list_adapters(self) -> dict[int, Any]:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def pin_adapter(self, adapter_id: int) -> bool:
|
||||
raise NotImplementedError
|
||||
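The interesting behaviour in `AdapterLRUCache` is that evicting an entry also deactivates the adapter through `deactivate_fn`. A self-contained sketch of that eviction-hook pattern, deliberately built on a plain `OrderedDict` rather than vLLM's own `LRUCache` (whose full API is not shown in this diff):

```python
from collections import OrderedDict
from typing import Callable


class EvictingCache:
    """Toy LRU cache that calls a deactivate hook when an entry is evicted."""

    def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
        self.capacity = capacity
        self.deactivate_fn = deactivate_fn
        self._data: "OrderedDict[int, object]" = OrderedDict()

    def put(self, key: int, value: object) -> None:
        self._data[key] = value
        self._data.move_to_end(key)
        if len(self._data) > self.capacity:
            evicted_key, _ = self._data.popitem(last=False)
            # Same idea as AdapterLRUCache._on_remove above.
            self.deactivate_fn(evicted_key)


cache = EvictingCache(2, lambda k: print(f"deactivate adapter {k}"))
for adapter_id in (1, 2, 3):  # inserting 3 evicts 1 and triggers the hook
    cache.put(adapter_id, object())
```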
26
vllm/adapter_commons/request.py
Normal file
@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import ABC, abstractmethod


class AdapterRequest(ABC):
    """
    Base class for adapter requests.
    """

    @property
    @abstractmethod
    def adapter_id(self) -> int:
        raise NotImplementedError

    def __post_init__(self) -> None:
        if self.adapter_id < 1:
            raise ValueError(f"id must be > 0, got {self.adapter_id}")

    def __eq__(self, value: object) -> bool:
        return isinstance(
            value, self.__class__) and self.adapter_id == value.adapter_id

    def __hash__(self) -> int:
        return hash(self.adapter_id)
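Concrete request types are expected to be dataclasses so that the inherited `__post_init__` id check actually runs. A hypothetical minimal subclass (vLLM's real request classes carry more fields; `request_id` is an invented name):

```python
from dataclasses import dataclass

from vllm.adapter_commons.request import AdapterRequest


@dataclass(frozen=True)
class DummyAdapterRequest(AdapterRequest):
    request_id: int  # hypothetical field

    @property
    def adapter_id(self) -> int:
        return self.request_id


req = DummyAdapterRequest(request_id=7)
assert req.adapter_id == 7
# DummyAdapterRequest(request_id=0) would raise ValueError("id must be > 0, ...").
```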
93
vllm/adapter_commons/utils.py
Normal file
@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Any, Callable, Optional


## model functions
def deactivate_adapter(adapter_id: int, active_adapters: dict[int, None],
                       deactivate_func: Callable) -> bool:
    if adapter_id in active_adapters:
        deactivate_func(adapter_id)
        active_adapters.pop(adapter_id)
        return True
    return False


def add_adapter(adapter: Any, registered_adapters: dict[int, Any],
                capacity: int, add_func: Callable) -> bool:
    if adapter.id not in registered_adapters:
        if len(registered_adapters) >= capacity:
            raise RuntimeError('No free adapter slots.')
        add_func(adapter)
        registered_adapters[adapter.id] = adapter
        return True
    return False


def set_adapter_mapping(mapping: Any, last_mapping: Any,
                        set_mapping_func: Callable) -> Any:
    if last_mapping != mapping:
        set_mapping_func(mapping)
        return mapping
    return last_mapping


def remove_adapter(adapter_id: int, registered_adapters: dict[int, Any],
                   deactivate_func: Callable) -> bool:
    deactivate_func(adapter_id)
    return bool(registered_adapters.pop(adapter_id, None))


def list_adapters(registered_adapters: dict[int, Any]) -> dict[int, Any]:
    return dict(registered_adapters)


def get_adapter(adapter_id: int,
                registered_adapters: dict[int, Any]) -> Optional[Any]:
    return registered_adapters.get(adapter_id)


## worker functions
def set_active_adapters_worker(requests: set[Any], mapping: Optional[Any],
                               apply_adapters_func,
                               set_adapter_mapping_func) -> None:
    apply_adapters_func(requests)
    set_adapter_mapping_func(mapping)


def add_adapter_worker(adapter_request: Any, list_adapters_func,
                       load_adapter_func, add_adapter_func,
                       activate_adapter_func) -> bool:
    if adapter_request.adapter_id in list_adapters_func():
        return False
    loaded_adapter = load_adapter_func(adapter_request)
    loaded = add_adapter_func(loaded_adapter)
    activate_adapter_func(loaded_adapter.id)
    return loaded


def apply_adapters_worker(adapter_requests: set[Any], list_adapters_func,
                          adapter_slots: int, remove_adapter_func,
                          add_adapter_func) -> None:
    models_that_exist = list_adapters_func()
    models_map = {
        adapter_request.adapter_id: adapter_request
        for adapter_request in adapter_requests if adapter_request
    }
    if len(models_map) > adapter_slots:
        raise RuntimeError(
            f"Number of requested models ({len(models_map)}) is greater "
            f"than the number of GPU model slots "
            f"({adapter_slots}).")
    new_models = set(models_map)
    models_to_add = new_models - models_that_exist
    models_to_remove = models_that_exist - new_models
    for adapter_id in models_to_remove:
        remove_adapter_func(adapter_id)
    for adapter_id in models_to_add:
        add_adapter_func(models_map[adapter_id])


def list_adapters_worker(adapter_manager_list_adapters_func) -> set[int]:
    return set(adapter_manager_list_adapters_func())
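These helpers are deliberately free functions so that concrete manager classes can delegate to them. A sketch of that wiring; `ToyManager` and its `_add`/`_deactivate` hooks are invented for illustration and are not part of the commit:

```python
from typing import Any

from vllm.adapter_commons.utils import add_adapter, deactivate_adapter


class ToyManager:

    def __init__(self, capacity: int = 4):
        self.capacity = capacity
        self._registered_adapters: dict[int, Any] = {}
        self._active_adapters: dict[int, None] = {}

    def _add(self, adapter: Any) -> None:
        print(f"loading adapter {adapter.id} onto the model")

    def _deactivate(self, adapter_id: int) -> None:
        print(f"deactivating adapter {adapter_id}")

    def add_adapter(self, adapter: Any) -> bool:
        return add_adapter(adapter, self._registered_adapters, self.capacity,
                           self._add)

    def deactivate_adapter(self, adapter_id: int) -> bool:
        return deactivate_adapter(adapter_id, self._active_adapters,
                                  self._deactivate)
```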
39
vllm/adapter_commons/worker_manager.py
Normal file
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import ABC, abstractmethod
from typing import Any, Optional

import torch


class AbstractWorkerManager(ABC):

    def __init__(self, device: torch.device):
        self.device = device

    @property
    @abstractmethod
    def is_enabled(self) -> bool:
        raise NotImplementedError

    @abstractmethod
    def set_active_adapters(self, requests: set[Any],
                            mapping: Optional[Any]) -> None:
        raise NotImplementedError

    @abstractmethod
    def add_adapter(self, adapter_request: Any) -> bool:
        raise NotImplementedError

    @abstractmethod
    def remove_adapter(self, adapter_id: int) -> bool:
        raise NotImplementedError

    @abstractmethod
    def remove_all_adapters(self) -> None:
        raise NotImplementedError

    @abstractmethod
    def list_adapters(self) -> set[int]:
        raise NotImplementedError
0
vllm/assets/__init__.py
Normal file
45
vllm/assets/audio.py
Normal file
@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from urllib.parse import urljoin

import numpy.typing as npt

from vllm.utils import PlaceholderModule

from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets

try:
    import librosa
except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]

ASSET_DIR = "multimodal_asset"

AudioAssetName = Literal["winning_call", "mary_had_lamb"]


@dataclass(frozen=True)
class AudioAsset:
    name: AudioAssetName

    @property
    def filename(self) -> str:
        return f"{self.name}.ogg"

    @property
    def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
        audio_path = get_vllm_public_assets(filename=self.filename,
                                            s3_prefix=ASSET_DIR)
        return librosa.load(audio_path, sr=None)

    def get_local_path(self) -> Path:
        return get_vllm_public_assets(filename=self.filename,
                                      s3_prefix=ASSET_DIR)

    @property
    def url(self) -> str:
        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
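Typical use of the audio asset, shown here as a sketch; the first access downloads the `.ogg` file into the assets cache, so it needs network access and `librosa` installed:

```python
from vllm.assets.audio import AudioAsset

asset = AudioAsset("winning_call")
print(asset.url)  # .../multimodal_asset/winning_call.ogg on the public S3 bucket
samples, sample_rate = asset.audio_and_sample_rate  # numpy samples + native sample rate
```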
41
vllm/assets/base.py
Normal file
@@ -0,0 +1,41 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from functools import lru_cache
from pathlib import Path
from typing import Optional

import vllm.envs as envs
from vllm.connections import global_http_connection

VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"


def get_cache_dir() -> Path:
    """Get the path to the cache for storing downloaded assets."""
    path = Path(envs.VLLM_ASSETS_CACHE)
    path.mkdir(parents=True, exist_ok=True)

    return path


@lru_cache
def get_vllm_public_assets(filename: str,
                           s3_prefix: Optional[str] = None) -> Path:
    """
    Download an asset file from ``s3://vllm-public-assets``
    and return the path to the downloaded file.
    """
    asset_directory = get_cache_dir() / "vllm_public_assets"
    asset_directory.mkdir(parents=True, exist_ok=True)

    asset_path = asset_directory / filename
    if not asset_path.exists():
        if s3_prefix is not None:
            filename = s3_prefix + "/" + filename
        global_http_connection.download_file(
            f"{VLLM_S3_BUCKET_URL}/{filename}",
            asset_path,
            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)

    return asset_path
34
vllm/assets/image.py
Normal file
@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from typing import Literal

import torch
from PIL import Image

from .base import get_vllm_public_assets

VLM_IMAGES_DIR = "vision_model_images"

ImageAssetName = Literal["stop_sign", "cherry_blossom"]


@dataclass(frozen=True)
class ImageAsset:
    name: ImageAssetName

    @property
    def pil_image(self) -> Image.Image:
        image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
                                            s3_prefix=VLM_IMAGES_DIR)
        return Image.open(image_path)

    @property
    def image_embeds(self) -> torch.Tensor:
        """
        Image embeddings, only used for testing purposes with llava 1.5.
        """
        image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                            s3_prefix=VLM_IMAGES_DIR)
        return torch.load(image_path, map_location="cpu", weights_only=True)
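And the image counterpart, which fetches `<name>.jpg` from the same public bucket on first use (sketch, requires network access):

```python
from vllm.assets.image import ImageAsset

image = ImageAsset("stop_sign").pil_image  # downloads vision_model_images/stop_sign.jpg once
print(image.size)
```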
139
vllm/assets/video.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from typing import Any, ClassVar, Literal, Optional
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
from .base import get_cache_dir
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
|
||||
@lru_cache
|
||||
def download_video_asset(filename: str) -> str:
|
||||
"""
|
||||
Download and open an image from huggingface
|
||||
repo: raushan-testing-hf/videos-test
|
||||
"""
|
||||
video_directory = get_cache_dir() / "video-example-data"
|
||||
video_directory.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
video_path = video_directory / filename
|
||||
video_path_str = str(video_path)
|
||||
if not video_path.exists():
|
||||
video_path_str = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test",
|
||||
filename=filename,
|
||||
repo_type="dataset",
|
||||
cache_dir=video_directory,
|
||||
)
|
||||
return video_path_str
|
||||
|
||||
|
||||
def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
|
||||
cap = cv2.VideoCapture(path)
|
||||
if not cap.isOpened():
|
||||
raise ValueError(f"Could not open video file {path}")
|
||||
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
frames = []
|
||||
|
||||
num_frames = num_frames if num_frames > 0 else total_frames
|
||||
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
|
||||
for idx in range(total_frames):
|
||||
ok = cap.grab() # next img
|
||||
if not ok:
|
||||
break
|
||||
if idx in frame_indices: # only decompress needed
|
||||
ret, frame = cap.retrieve()
|
||||
if ret:
|
||||
frames.append(frame)
|
||||
|
||||
frames = np.stack(frames)
|
||||
if len(frames) < num_frames:
|
||||
raise ValueError(f"Could not read enough frames from video file {path}"
|
||||
f" (expected {num_frames} frames, got {len(frames)})")
|
||||
return frames
|
||||
|
||||
|
||||
def video_to_pil_images_list(path: str,
|
||||
num_frames: int = -1) -> list[Image.Image]:
|
||||
frames = video_to_ndarrays(path, num_frames)
|
||||
return [
|
||||
Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
||||
for frame in frames
|
||||
]
|
||||
|
||||
|
||||
def video_get_metadata(path: str) -> dict[str, Any]:
|
||||
cap = cv2.VideoCapture(path)
|
||||
if not cap.isOpened():
|
||||
raise ValueError(f"Could not open video file {path}")
|
||||
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
duration = total_frames / fps if fps > 0 else 0
|
||||
|
||||
metadata = {
|
||||
"total_num_frames": total_frames,
|
||||
"fps": fps,
|
||||
"duration": duration,
|
||||
"video_backend": "opencv"
|
||||
}
|
||||
return metadata
|
||||
|
||||
|
||||
VideoAssetName = Literal["baby_reading"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class VideoAsset:
|
||||
name: VideoAssetName
|
||||
num_frames: int = -1
|
||||
|
||||
_NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = {
|
||||
"baby_reading": "sample_demo_1.mp4",
|
||||
}
|
||||
|
||||
@property
|
||||
def filename(self) -> str:
|
||||
return self._NAME_TO_FILE[self.name]
|
||||
|
||||
@property
|
||||
def pil_images(self) -> list[Image.Image]:
|
||||
video_path = download_video_asset(self.filename)
|
||||
ret = video_to_pil_images_list(video_path, self.num_frames)
|
||||
return ret
|
||||
|
||||
@property
|
||||
def np_ndarrays(self) -> npt.NDArray:
|
||||
video_path = download_video_asset(self.filename)
|
||||
ret = video_to_ndarrays(video_path, self.num_frames)
|
||||
return ret
|
||||
|
||||
@property
|
||||
def metadata(self) -> dict[str, Any]:
|
||||
video_path = download_video_asset(self.filename)
|
||||
ret = video_get_metadata(video_path)
|
||||
return ret
|
||||
|
||||
def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
|
||||
"""
|
||||
Read audio data from the video asset, used in Qwen2.5-Omni examples.
|
||||
|
||||
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
|
||||
"""
|
||||
video_path = download_video_asset(self.filename)
|
||||
return librosa.load(video_path, sr=sampling_rate)[0]
|
||||
20
vllm/attention/__init__.py
Normal file
@@ -0,0 +1,20 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.attention.backends.abstract import (AttentionBackend,
                                              AttentionMetadata,
                                              AttentionMetadataBuilder,
                                              AttentionState, AttentionType)
from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend

__all__ = [
    "Attention",
    "AttentionBackend",
    "AttentionMetadata",
    "AttentionType",
    "AttentionMetadataBuilder",
    "Attention",
    "AttentionState",
    "get_attn_backend",
]
0
vllm/attention/backends/__init__.py
Normal file
325
vllm/attention/backends/abstract.py
Normal file
@@ -0,0 +1,325 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, fields
|
||||
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
|
||||
Protocol, Set, Tuple, Type, TypeVar)
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner_base import (ModelRunnerBase,
|
||||
ModelRunnerInputBase,
|
||||
ModelRunnerInputBuilderBase)
|
||||
|
||||
|
||||
class AttentionType:
|
||||
"""
|
||||
Attention type.
|
||||
Use string to be compatible with `torch.compile`.
|
||||
"""
|
||||
# Decoder attention between previous layer Q/K/V
|
||||
DECODER = "decoder"
|
||||
# Encoder attention between previous layer Q/K/V for encoder-decoder
|
||||
ENCODER = "encoder"
|
||||
# Encoder attention between previous layer Q/K/V
|
||||
ENCODER_ONLY = "encoder_only"
|
||||
# Attention between dec. Q and enc. K/V for encoder-decoder
|
||||
ENCODER_DECODER = "encoder_decoder"
|
||||
|
||||
|
||||
class AttentionBackend(ABC):
|
||||
"""Abstract class for attention backends."""
|
||||
# For some attention backends, we allocate an output tensor before
|
||||
# calling the custom op. When piecewise cudagraph is enabled, this
|
||||
# makes sure the output tensor is allocated inside the cudagraph.
|
||||
accept_output_buffer: bool = False
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_name() -> str:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_impl_cls() -> Type["AttentionImpl"]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_metadata_cls() -> Type["AttentionMetadata"]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_state_cls() -> Type["AttentionState"]:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
|
||||
return cls.get_metadata_cls()(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_builder_cls() -> Type["AttentionMetadataBuilder"]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_stride_order() -> Tuple[int, ...]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: torch.Tensor,
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dists: torch.Tensor,
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def advance_step(self, model_input: "ModelRunnerInputBase",
|
||||
sampled_token_ids: Optional[torch.Tensor],
|
||||
block_size: int, num_seqs: int, num_queries: int) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@dataclass
|
||||
class AttentionMetadata:
|
||||
"""Attention metadata for prefill and decode batched together."""
|
||||
# Total number of prefill requests.
|
||||
num_prefills: int
|
||||
# Number of prefill tokens.
|
||||
num_prefill_tokens: int
|
||||
# Number of decode tokens. Note that it is equivalent to the number of
|
||||
# decode requests.
|
||||
num_decode_tokens: int
|
||||
# (num_tokens,). The indices of the token slots that input tokens will be
|
||||
# stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
|
||||
# is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
|
||||
# in block 0, and 1st slot in block 1, respectively.
|
||||
slot_mapping: torch.Tensor
|
||||
|
||||
# The index maps that relate multi-modal embeddings to the corresponding
|
||||
# placeholders.
|
||||
#
|
||||
# N.B. These aren't really related to attention and don't belong on this
|
||||
# type -- this is just a temporary solution to make them available to
|
||||
# `model_executable`.
|
||||
multi_modal_placeholder_index_maps: Optional[Dict[
|
||||
str, MultiModalPlaceholderMap.IndexMap]]
|
||||
|
||||
# Enable/disable KV scales calculation. This is so that we can disable the
|
||||
# calculation until after prefill and cuda graph capture.
|
||||
enable_kv_scales_calculation: bool
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def prefill_metadata(self) -> Optional["AttentionMetadata"]:
|
||||
"""Return the attention metadata that's required to run prefill
|
||||
attention."""
|
||||
pass
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def decode_metadata(self) -> Optional["AttentionMetadata"]:
|
||||
"""Return the attention metadata that's required to run decode
|
||||
attention."""
|
||||
pass
|
||||
|
||||
def asdict_zerocopy(self,
|
||||
skip_fields: Optional[Set[str]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Similar to dataclasses.asdict, but avoids deepcopying."""
|
||||
if skip_fields is None:
|
||||
skip_fields = set()
|
||||
# Note that if we add dataclasses as fields, they will need
|
||||
# similar handling.
|
||||
return {
|
||||
field.name: getattr(self, field.name)
|
||||
for field in fields(self) if field.name not in skip_fields
|
||||
}
|
||||
|
||||
|
||||
T = TypeVar("T", bound=AttentionMetadata)
|
||||
|
||||
|
||||
class AttentionState(ABC, Generic[T]):
|
||||
"""Holds attention backend-specific objects reused during the
|
||||
lifetime of the model runner."""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, runner: "ModelRunnerBase"):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
@contextmanager
|
||||
def graph_capture(self, max_batch_size: int):
|
||||
"""Context manager used when capturing CUDA graphs."""
|
||||
yield
|
||||
|
||||
@abstractmethod
|
||||
def graph_clone(self, batch_size: int) -> "AttentionState[T]":
|
||||
"""Clone attention state to save in CUDA graph metadata."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def graph_capture_get_metadata_for_batch(
|
||||
self,
|
||||
batch_size: int,
|
||||
is_encoder_decoder_model: bool = False) -> T:
|
||||
"""Get attention metadata for CUDA graph capture of batch_size."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def get_graph_input_buffers(
|
||||
self,
|
||||
attn_metadata: T,
|
||||
is_encoder_decoder_model: bool = False) -> Dict[str, Any]:
|
||||
"""Get attention-specific input buffers for CUDA graph capture."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def prepare_graph_input_buffers(
|
||||
self,
|
||||
input_buffers: Dict[str, Any],
|
||||
attn_metadata: T,
|
||||
is_encoder_decoder_model: bool = False) -> None:
|
||||
"""In-place modify input buffers dict for CUDA graph replay."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def begin_forward(self, model_input: "ModelRunnerInputBase") -> None:
|
||||
"""Prepare state for forward pass."""
|
||||
...
|
||||
|
||||
|
||||
class AttentionMetadataBuilder(ABC, Generic[T]):
|
||||
"""Abstract class for attention metadata builders."""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None:
|
||||
"""Create the builder, remember some configuration and parameters."""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def prepare(self) -> None:
|
||||
"""Prepare for one batch."""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def build(self, seq_lens: List[int], query_lens: List[int],
|
||||
cuda_graph_pad_size: int, batch_size: int) -> T:
|
||||
"""Build attention metadata with on-device tensors."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class AttentionLayer(Protocol):
|
||||
|
||||
_q_scale: torch.Tensor
|
||||
_k_scale: torch.Tensor
|
||||
_v_scale: torch.Tensor
|
||||
_k_scale_float: float
|
||||
_v_scale_float: float
|
||||
_prob_scale: torch.Tensor
|
||||
|
||||
def forward(
|
||||
self,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
|
||||
|
||||
class AttentionImpl(ABC, Generic[T]):
|
||||
|
||||
@abstractmethod
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: Optional[int] = None,
|
||||
alibi_slopes: Optional[List[float]] = None,
|
||||
sliding_window: Optional[int] = None,
|
||||
kv_cache_dtype: str = "auto",
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: T,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
def fused_output_quant_supported(self, dtype: torch.dtype, static: bool,
|
||||
group_shape: tuple[int, int]):
|
||||
"""
|
||||
Does this attention implementation support fused output quantization.
|
||||
This is used by the AttnFusionPass to only fuse output quantization
|
||||
onto implementations that support it.
|
||||
|
||||
TODO(luka) merge parameters into QuantDescriptor
|
||||
:param dtype: quantized dtype
|
||||
:param static: static or dynamic quantization
|
||||
:param group_shape: quant group shape. (-1, -1) for per-tensor.
|
||||
:return: is fusion supported for this type of quantization
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
|
||||
|
||||
@abstractmethod
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
hidden_states_or_cq: torch.Tensor,
|
||||
kv_c_normed: torch.Tensor,
|
||||
k_pe: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: T,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def is_quantized_kv_cache(kv_cache_dtype: str) -> bool:
|
||||
return kv_cache_dtype != "auto"
|
||||
469
vllm/attention/backends/blocksparse_attn.py
Normal file
@@ -0,0 +1,469 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionLayer,
|
||||
AttentionMetadata, AttentionType)
|
||||
from vllm.attention.backends.utils import (CommonAttentionState,
|
||||
CommonMetadataBuilder)
|
||||
from vllm.attention.ops.blocksparse_attention.interface import (
|
||||
LocalStridedBlockSparseAttn, get_head_sliding_step)
|
||||
from vllm.attention.ops.paged_attn import PagedAttention
|
||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlocksparseParams:
|
||||
max_seqlen: int
|
||||
|
||||
# Num q heads per tensor-parallel rank/partition
|
||||
num_heads: int # per TP partition
|
||||
# Num kv heads per tensor-parallel rank/partition
|
||||
num_kv_heads: int
|
||||
|
||||
# block size used for blocksparse attention.
|
||||
# This is the block_size used in `local_blocks`, `vert_stride`.
|
||||
block_size: int
|
||||
|
||||
# Number of blocks for local attention, i.e., number of
|
||||
# local attended tokens / `sparse_block_size`
|
||||
local_blocks: int
|
||||
|
||||
# Attend to one block per every `vert_stride` blocks.
|
||||
# Controlling the sparsity
|
||||
vert_stride: int
|
||||
"""
|
||||
If to use the same vertical stride offset for all heads,
|
||||
i.e., attend to the same block of tokens on all heads.
|
||||
By default, it is False, i.e., attention on the non-local
|
||||
blocks depends on the `head_idx`, that is on
|
||||
blocks satisfying
|
||||
`(block_idx + head_idx * head_sliding_step + 1) % vert_stride == 0`
|
||||
where `head_sliding_step=max(1, int(vert_stride / num_total_heads))`,
|
||||
`block_idx = position_id // sparse_block_size`.
|
||||
See `..ops.blocksparse_attention.utils:get_sparse_attn_mask`
|
||||
for more detail.
|
||||
"""
|
||||
homo_head: bool = False
|
||||
|
||||
# If within a group, the kv offsets that each q attends is the same or no.
|
||||
homo_head_group: bool = False
|
||||
|
||||
# Decided by homo_head and homo_head group
|
||||
head_sliding_step: int = field(init=False)
|
||||
|
||||
# range of q heads to for a TP rank
|
||||
active_head_range: Tuple = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
assert self.block_size > 0
|
||||
assert self.local_blocks >= 0
|
||||
assert self.vert_stride >= 1
|
||||
|
||||
tp_size = get_tensor_model_parallel_world_size()
|
||||
tp_rank = get_tensor_model_parallel_rank()
|
||||
total_heads = tp_size * self.num_heads
|
||||
total_kv_heads = tp_size * self.num_kv_heads
|
||||
|
||||
if self.homo_head:
|
||||
self.head_sliding_step = 0
|
||||
elif self.homo_head_group:
|
||||
head_sliding_step = get_head_sliding_step(total_kv_heads,
|
||||
self.vert_stride)
|
||||
# negative indicates sliding along kv heads, i.e., homo q group
|
||||
self.head_sliding_step = -head_sliding_step
|
||||
else:
|
||||
self.head_sliding_step = get_head_sliding_step(
|
||||
total_heads, self.vert_stride)
|
||||
|
||||
self.active_head_range = (
|
||||
tp_rank * self.num_heads,
|
||||
(tp_rank + 1) * self.num_heads,
|
||||
)
|
||||
|
||||
|
||||
class BlocksparseFlashAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "BLOCK_SPARSE_FLASH_ATTN"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]:
|
||||
return BlocksparseFlashAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["AttentionMetadata"]:
|
||||
return BlocksparseFlashAttentionMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls() -> Type["BlocksparseFlashAttentionMetadataBuilder"]:
|
||||
return BlocksparseFlashAttentionMetadataBuilder
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
|
||||
num_kv_heads, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: Dict[int, int],
|
||||
) -> None:
|
||||
PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dists: Dict[int, List[int]],
|
||||
) -> None:
|
||||
PagedAttention.copy_blocks(kv_caches, src_to_dists)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlocksparseFlashAttentionMetadata(AttentionMetadata):
|
||||
"""A copy of Metadata for FlashAttentionBackend,
|
||||
to avoid having to install flash_attn.
|
||||
|
||||
NOTE: Any python object stored here is not updated when it is
|
||||
cuda-graph replayed. If you have values that need to be changed
|
||||
dynamically, it should be stored in tensor. The tensor has to be
|
||||
updated from `CUDAGraphRunner.forward` API.
|
||||
"""
|
||||
# (batch_size,). The sequence length per sequence. Sequence length means
|
||||
# the computed tokens + new tokens None if it is a decoding.
|
||||
seq_lens: Optional[List[int]]
|
||||
# seq_lens stored as a tensor.
|
||||
seq_lens_tensor: Optional[torch.Tensor]
|
||||
|
||||
# NOTE(sang): Definition of context_len, query_len, and seq_len.
|
||||
# |---------- N-1 iteration --------|
|
||||
# |---------------- N iteration ---------------------|
|
||||
# |- tokenA -|......................|-- newTokens ---|
|
||||
# |---------- context_len ----------|
|
||||
# |-------------------- seq_len ----------------------|
|
||||
# |-- query_len ---|
|
||||
|
||||
# Maximum query length in the batch. None for decoding.
|
||||
max_query_len: Optional[int]
|
||||
# Maximum sequence length among prefill batch. 0 if there are decoding
|
||||
# requests only.
|
||||
max_prefill_seq_len: int
|
||||
# Maximum sequence length among decode batch. 0 if there are prefill
|
||||
# requests only.
|
||||
max_decode_seq_len: int
|
||||
# (batch_size + 1,). The cumulative subquery lengths of the sequences in
|
||||
# the batch, used to index into subquery. E.g., if the subquery length
|
||||
# is [4, 6], it is [0, 4, 10].
|
||||
query_start_loc: Optional[torch.Tensor]
|
||||
# (batch_size + 1,). The cumulative sequence lengths of the sequences in
|
||||
# the batch, used to index into sequence. E.g., if the sequence length is
|
||||
# [4, 6], it is [0, 4, 10].
|
||||
seq_start_loc: Optional[torch.Tensor]
|
||||
# (batch_size,) A tensor of context lengths (tokens that are computed
|
||||
# so far).
|
||||
context_lens_tensor: Optional[torch.Tensor]
|
||||
|
||||
# (batch_size, max_blocks_per_seq).
|
||||
# Block addresses per sequence. (Seq id -> list of physical block)
|
||||
# E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
|
||||
# in the kv cache. Each block can contain up to block_size tokens.
|
||||
# 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
|
||||
# captured.
|
||||
block_tables: Optional[torch.Tensor]
|
||||
|
||||
# Whether or not if cuda graph is enabled.
|
||||
# Cuda-graph is currently enabled for decoding only.
|
||||
# TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
|
||||
use_cuda_graph: bool
|
||||
|
||||
# Max number of query tokens for among request in the batch.
|
||||
max_decode_query_len: Optional[int] = None
|
||||
|
||||
_cached_prefill_metadata: Optional[
|
||||
"BlocksparseFlashAttentionMetadata"] = None
|
||||
_cached_decode_metadata: Optional[
|
||||
"BlocksparseFlashAttentionMetadata"] = None
|
||||
|
||||
block_tables_list: Optional[List[int]] = None
|
||||
|
||||
@property
|
||||
def prefill_metadata(
|
||||
self) -> Optional["BlocksparseFlashAttentionMetadata"]:
|
||||
if self.num_prefills == 0:
|
||||
return None
|
||||
|
||||
if self._cached_prefill_metadata is not None:
|
||||
return self._cached_prefill_metadata
|
||||
|
||||
assert self.seq_lens is not None
|
||||
assert self.seq_lens_tensor is not None
|
||||
assert self.query_start_loc is not None
|
||||
assert self.context_lens_tensor is not None
|
||||
assert self.block_tables is not None
|
||||
assert self.seq_start_loc is not None
|
||||
|
||||
self._cached_prefill_metadata = BlocksparseFlashAttentionMetadata(
|
||||
num_prefills=self.num_prefills,
|
||||
num_prefill_tokens=self.num_prefill_tokens,
|
||||
num_decode_tokens=0,
|
||||
slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
|
||||
multi_modal_placeholder_index_maps=self.
|
||||
multi_modal_placeholder_index_maps,
|
||||
enable_kv_scales_calculation=self.enable_kv_scales_calculation,
|
||||
seq_lens=self.seq_lens[:self.num_prefills],
|
||||
seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
|
||||
max_query_len=self.max_query_len,
|
||||
max_prefill_seq_len=self.max_prefill_seq_len,
|
||||
max_decode_seq_len=0,
|
||||
query_start_loc=self.query_start_loc[:self.num_prefills + 1],
|
||||
seq_start_loc=self.seq_start_loc[:self.num_prefills + 1],
|
||||
context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
|
||||
block_tables=self.block_tables[:self.num_prefills],
|
||||
use_cuda_graph=False,
|
||||
block_tables_list=self.block_tables_list
|
||||
)
|
||||
return self._cached_prefill_metadata
|
||||
|
||||
@property
|
||||
def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]:
|
||||
if self.num_decode_tokens == 0:
|
||||
return None
|
||||
|
||||
if self._cached_decode_metadata is not None:
|
||||
return self._cached_decode_metadata
|
||||
assert self.block_tables is not None
|
||||
assert self.seq_lens_tensor is not None
|
||||
|
||||
self._cached_decode_metadata = BlocksparseFlashAttentionMetadata(
|
||||
num_prefills=0,
|
||||
num_prefill_tokens=0,
|
||||
num_decode_tokens=self.num_decode_tokens,
|
||||
slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=False,
|
||||
seq_lens=None,
|
||||
seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
|
||||
max_query_len=None,
|
||||
max_prefill_seq_len=0,
|
||||
max_decode_seq_len=self.max_decode_seq_len,
|
||||
query_start_loc=None,
|
||||
seq_start_loc=None,
|
||||
context_lens_tensor=None,
|
||||
block_tables=self.block_tables[self.num_prefills:],
|
||||
use_cuda_graph=self.use_cuda_graph,
|
||||
block_tables_list=self.block_tables_list
|
||||
)
|
||||
return self._cached_decode_metadata
|
||||
|
||||
|
||||
class BlocksparseFlashAttentionMetadataBuilder(
|
||||
CommonMetadataBuilder[BlocksparseFlashAttentionMetadata]):
|
||||
|
||||
_metadata_cls = BlocksparseFlashAttentionMetadata
|
||||
|
||||
|
||||
class BlocksparseFlashAttentionImpl(AttentionImpl):
|
||||
"""
|
||||
If the input tensors contain prompt tokens, the layout is as follows:
|
||||
|<--------------- num_prompt_tokens -------------->|
|
||||
|<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|
|
||||
|
||||
Otherwise, the layout is as follows:
|
||||
|<------------------ num_generation_tokens (M) ----------------->|
|
||||
|<--generation_0-->|..........|<--generation_M-1-->|<--padding-->|
|
||||
|
||||
Generation tokens can contain padding when cuda-graph is used.
|
||||
Currently, prompt tokens don't contain any padding.
|
||||
|
||||
The prompts might have different lengths, while the generation tokens
|
||||
always have length 1.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
) -> None:
|
||||
if kv_sharing_target_layer_name is not None:
|
||||
raise NotImplementedError("KV sharing is not supported in V0.")
|
||||
assert blocksparse_params is not None
|
||||
assert alibi_slopes is None, ValueError(
|
||||
"Alibi not support for blocksparse flash attention.")
|
||||
assert sliding_window is None, ValueError(
|
||||
"sliding_window is invalid for blocksparse attention.")
|
||||
assert logits_soft_cap is None, ValueError(
|
||||
"logits_soft_cap is invalid for blocksparse attention.")
|
||||
|
||||
if "num_heads" not in blocksparse_params:
|
||||
blocksparse_params["num_heads"] = num_heads
|
||||
if "num_kv_heads" not in blocksparse_params:
|
||||
blocksparse_params["num_kv_heads"] = num_kv_heads or num_heads
|
||||
self.blocksparse_params = BlocksparseParams(**blocksparse_params)
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
self.alibi_slopes = alibi_slopes
|
||||
self.num_kv_heads = num_kv_heads
|
||||
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
|
||||
self.local_blocks = self.blocksparse_params.local_blocks
|
||||
self.vert_stride = self.blocksparse_params.vert_stride
|
||||
self.sparse_block_size = self.blocksparse_params.block_size
|
||||
self.head_sliding_step = self.blocksparse_params.head_sliding_step
|
||||
|
||||
supported_head_sizes = PagedAttention.get_supported_head_sizes()
|
||||
if head_size not in supported_head_sizes:
|
||||
raise ValueError(
|
||||
f"Head size {head_size} is not supported by PagedAttention. "
|
||||
f"Supported head sizes are: {supported_head_sizes}.")
|
||||
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
self.tp_rank = get_tensor_model_parallel_rank()
|
||||
|
||||
total_num_heads = num_heads * self.tp_size
|
||||
self.bs_attn = LocalStridedBlockSparseAttn(
|
||||
total_num_heads,
|
||||
self.blocksparse_params.max_seqlen,
|
||||
self.blocksparse_params.local_blocks,
|
||||
self.blocksparse_params.vert_stride,
|
||||
self.blocksparse_params.block_size,
|
||||
homo_head=self.blocksparse_params.homo_head,
|
||||
active_head_range=self.blocksparse_params.active_head_range,
|
||||
)
|
||||
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"BlocksparseFlashAttentionImpl")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: BlocksparseFlashAttentionMetadata,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Forward pass with FlashAttention and PagedAttention.
|
||||
|
||||
Args:
|
||||
query: shape = [num_tokens, num_heads * head_size]
|
||||
key: shape = [num_tokens, num_kv_heads * head_size]
|
||||
value: shape = [num_tokens, num_kv_heads * head_size]
|
||||
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
|
||||
NOTE: kv_cache will be an empty tensor with shape [0]
|
||||
for profiling run.
|
||||
attn_metadata: Metadata for attention.
|
||||
Returns:
|
||||
shape = [num_tokens, num_heads * head_size]
|
||||
"""
|
||||
if output_scale is not None:
|
||||
raise NotImplementedError(
|
||||
"fused output quantization is not yet supported"
|
||||
" for BlocksparseFlashAttentionImpl")
|
||||
|
||||
num_tokens, hidden_size = query.shape
|
||||
# Reshape the query, key, and value tensors.
|
||||
query = query.view(-1, self.num_heads, self.head_size)
|
||||
key = key.view(-1, self.num_kv_heads, self.head_size)
|
||||
value = value.view(-1, self.num_kv_heads, self.head_size)
|
||||
|
||||
if kv_cache.numel() > 0:
|
||||
key_cache, value_cache = PagedAttention.split_kv_cache(
|
||||
kv_cache, self.num_kv_heads, self.head_size)
|
||||
|
||||
# Reshape the input keys and values and store them in the cache.
|
||||
# If kv_cache is not provided, the new key and value tensors are
|
||||
# not cached. This happens during the initial memory profiling run.
|
||||
|
||||
PagedAttention.write_to_paged_cache(
|
||||
key,
|
||||
value,
|
||||
key_cache,
|
||||
value_cache,
|
||||
attn_metadata.slot_mapping,
|
||||
self.kv_cache_dtype,
|
||||
layer._k_scale,
|
||||
layer._v_scale,
|
||||
)
|
||||
|
||||
if prefill_meta := attn_metadata.prefill_metadata:
|
||||
|
||||
# Prompt run.
|
||||
# normal attention
|
||||
# When block_tables are not filled, it means q and k are the
|
||||
# prompt, and they have the same length.
|
||||
|
||||
assert kv_cache.numel() == 0 \
|
||||
or prefill_meta.block_tables is None \
|
||||
or prefill_meta.block_tables.numel() == 0, \
|
||||
"Does not support prefix-enabled attention."
|
||||
|
||||
output = self.bs_attn(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
cu_seqlens_q=prefill_meta.seq_start_loc,
|
||||
cu_seqlens_k=prefill_meta.seq_start_loc,
|
||||
sm_scale=self.scale,
|
||||
)
|
||||
|
||||
if decode_meta := attn_metadata.decode_metadata:
|
||||
# Decoding run.
|
||||
output = PagedAttention.forward_decode(
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
decode_meta.block_tables,
|
||||
decode_meta.seq_lens_tensor,
|
||||
self.blocksparse_params.max_seqlen,
|
||||
self.kv_cache_dtype,
|
||||
self.num_kv_heads,
|
||||
self.scale,
|
||||
self.alibi_slopes,
|
||||
layer._k_scale,
|
||||
layer._v_scale,
|
||||
tp_rank=self.tp_rank,
|
||||
blocksparse_local_blocks=self.local_blocks,
|
||||
blocksparse_vert_stride=self.vert_stride,
|
||||
blocksparse_block_size=self.sparse_block_size,
|
||||
blocksparse_head_sliding_step=self.head_sliding_step,
|
||||
)
|
||||
|
||||
assert output is not None
|
||||
# Reshape the output tensor.
|
||||
return output.view(num_tokens, hidden_size)
|
||||
File diff suppressed because it is too large (9 files)
307
vllm/attention/backends/cpu_mla.py
Normal file
@@ -0,0 +1,307 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm._ipex_ops import ipex_ops
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadataBuilder,
|
||||
AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.mla.common import MLACommonImpl, MLACommonState
|
||||
from vllm.attention.backends.torch_sdpa import TorchSDPAMetadata
|
||||
from vllm.utils import make_tensor_with_pad
|
||||
from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
|
||||
|
||||
|
||||
class CPUMLABackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "CPU_MLA"
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["CPUMLAMetadata"]:
|
||||
return CPUMLAMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls() -> Type["CPUMLAMetadataBuilder"]:
|
||||
return CPUMLAMetadataBuilder
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["MLACommonState"]:
|
||||
return MLACommonState
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["CPUMLAImpl"]:
|
||||
return CPUMLAImpl
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int, # assumed to be 1 for MLA
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return (num_blocks, block_size, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: torch.Tensor,
|
||||
) -> None:
|
||||
ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dists: torch.Tensor,
|
||||
) -> None:
|
||||
ops.copy_blocks_mla(kv_caches, src_to_dists)
|
||||
|
||||
@staticmethod
|
||||
def get_supported_head_sizes() -> List[int]:
|
||||
return [576]
|
||||
|
||||
|
||||
@dataclass
|
||||
class CPUMLAMetadata(TorchSDPAMetadata):
|
||||
# New for MLA
|
||||
# Input positions for rotrary embeddings since for MLA the rotary
|
||||
# position embeddings are applied inside the attention backend
|
||||
input_positions: torch.Tensor = None
|
||||
|
||||
# required by MLACommonImpl
|
||||
is_profile_run: bool = False
|
||||
|
||||
|
||||
class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
|
||||
|
||||
def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
|
||||
self.chunked_prefill = input_builder.chunked_prefill
|
||||
self.input_builder = input_builder
|
||||
assert not self.chunked_prefill, \
|
||||
"chunked prefill is currently not supported"
|
||||
|
||||
def prepare(self):
|
||||
self.input_data = self.input_builder.input_data
|
||||
|
||||
def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size):
|
||||
input_data = self.input_data
|
||||
prefill_seq_lens = seq_lens[0:input_data.num_prefills]
|
||||
prefill_query_lens = query_lens[0:input_data.num_prefills]
|
||||
slot_mapping = torch.tensor(input_data.slot_mapping,
|
||||
dtype=torch.long,
|
||||
device="cpu")
|
||||
|
||||
# metadata for prefill
|
||||
if input_data.num_prefills > 0:
|
||||
query_lens_tensor = torch.tensor(prefill_query_lens,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
kv_lens_tensor = torch.tensor(prefill_seq_lens,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
query_start_loc = torch.zeros(input_data.num_prefills + 1,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
kv_start_loc = torch.zeros(input_data.num_prefills + 1,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
torch.cumsum(query_lens_tensor,
|
||||
dim=0,
|
||||
dtype=torch.int32,
|
||||
out=query_start_loc[1:])
|
||||
torch.cumsum(kv_lens_tensor,
|
||||
dim=0,
|
||||
dtype=torch.int32,
|
||||
out=kv_start_loc[1:])
|
||||
max_query_len = max(prefill_query_lens)
|
||||
max_kv_len = max(prefill_seq_lens)
|
||||
|
||||
# for chunked-prefill
|
||||
if self.chunked_prefill:
|
||||
prefill_block_tables = make_tensor_with_pad(
|
||||
self.input_data.prefill_block_tables,
|
||||
pad=0,
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
else:
|
||||
prefill_block_tables = None
|
||||
|
||||
else:
|
||||
query_start_loc = None
|
||||
kv_start_loc = None
|
||||
max_query_len = None
|
||||
max_kv_len = None
|
||||
prefill_block_tables = None
|
||||
|
||||
# metadata for decode
|
||||
if input_data.num_decode_tokens != 0:
|
||||
seq_lens_tensor = torch.tensor(
|
||||
input_data.seq_lens[input_data.num_prefills:],
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
block_tables = make_tensor_with_pad(
|
||||
self.input_data.decode_block_tables,
|
||||
pad=0,
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
else:
|
||||
block_tables = torch.tensor([])
|
||||
seq_lens_tensor = torch.tensor(
|
||||
input_data.seq_lens[:input_data.num_prefills],
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
|
||||
# For multi-modal models
|
||||
placeholder_index_maps = None
|
||||
if len(input_data.multi_modal_inputs_list) != 0:
|
||||
placeholder_index_maps = {
|
||||
modality: placeholder_map.index_map()
|
||||
for modality, placeholder_map in
|
||||
input_data.multi_modal_placeholder_maps.items()
|
||||
}
|
||||
|
||||
return CPUMLAMetadata(
|
||||
chunked_prefill=self.chunked_prefill,
|
||||
seq_lens=prefill_seq_lens,
|
||||
seq_lens_tensor=seq_lens_tensor,
|
||||
max_query_len=max_query_len,
|
||||
max_kv_len=max_kv_len,
|
||||
prefill_query_start_loc=query_start_loc,
|
||||
kv_start_loc=kv_start_loc,
|
||||
max_decode_seq_len=input_data.max_decode_seq_len,
|
||||
num_prefills=input_data.num_prefills,
|
||||
num_prefill_tokens=input_data.num_prefill_tokens,
|
||||
num_decode_tokens=input_data.num_decode_tokens,
|
||||
block_tables=block_tables,
|
||||
prefill_block_tables=prefill_block_tables,
|
||||
slot_mapping=slot_mapping,
|
||||
multi_modal_placeholder_index_maps=placeholder_index_maps,
|
||||
enable_kv_scales_calculation=False,
|
||||
input_positions=torch.tensor([self.input_data.input_positions]))
|
||||
|
||||
|
||||
class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]],
|
||||
logits_soft_cap: Optional[float],
|
||||
attn_type: str,
|
||||
kv_sharing_target_layer_name: Optional[str],
|
||||
# MLA Specific Arguments
|
||||
**mla_args) -> None:
|
||||
super().__init__(num_heads, head_size, scale, num_kv_heads,
|
||||
alibi_slopes, sliding_window, kv_cache_dtype,
|
||||
blocksparse_params, logits_soft_cap, attn_type,
|
||||
kv_sharing_target_layer_name, **mla_args)
|
||||
|
||||
unsupported_features = [
|
||||
alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
|
||||
]
|
||||
if any(unsupported_features):
|
||||
raise NotImplementedError(
|
||||
"CPUMLAImpl does not support one of the following: "
|
||||
"alibi_slopes, sliding_window, blocksparse_params, "
|
||||
"logits_soft_cap")
|
||||
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"CPUMLAImpl")
|
||||
|
||||
# states is implemented.
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype):
|
||||
raise NotImplementedError(
|
||||
"CPUMLAImpl with FP8 KV cache not yet supported")
|
||||
|
||||
def _forward_prefill(
|
||||
self,
|
||||
q: torch.Tensor,
|
||||
kv_c_normed: torch.Tensor,
|
||||
k_pe: torch.Tensor,
|
||||
kv_c_and_k_pe_cache: torch.Tensor,
|
||||
attn_metadata: CPUMLAMetadata, # type: ignore[override]
|
||||
) -> torch.Tensor:
|
||||
|
||||
prefill_metadata = attn_metadata.prefill_metadata
|
||||
assert prefill_metadata is not None
|
||||
|
||||
kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
|
||||
-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
|
||||
k_nope, v = kv_nope\
|
||||
.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
|
||||
|
||||
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
|
||||
|
||||
# For MLA the v head dim is smaller than qk head dim so we pad out
|
||||
# v with 0s to match the qk head dim
|
||||
v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
|
||||
value=0)
|
||||
|
||||
output = torch.empty_like(q)
|
||||
ipex_ops.varlen_attention(
|
||||
query=q,
|
||||
key=k,
|
||||
value=v_padded,
|
||||
out=output,
|
||||
seqlen_q=prefill_metadata.prefill_query_start_loc,
|
||||
seqlen_k=prefill_metadata.prefill_query_start_loc,
|
||||
max_seqlen_q=prefill_metadata.max_query_len,
|
||||
max_seqlen_k=prefill_metadata.max_query_len,
|
||||
pdropout=0.0,
|
||||
softmax_scale=self.scale,
|
||||
zero_tensors=False,
|
||||
is_causal=True,
|
||||
return_softmax=False,
|
||||
gen_=None,
|
||||
logits_soft_cap=0.0,
|
||||
window_size_left=-1,
|
||||
window_size_right=-1,
|
||||
alibi_slopes=None,
|
||||
)
|
||||
|
||||
# remove padding
|
||||
output = output.view(-1, self.num_heads,
|
||||
q.shape[-1])[..., :v.shape[-1]]
|
||||
return output.reshape(-1, self.num_heads * v.shape[-1])
|
||||
|
||||
def _forward_decode(
|
||||
self,
|
||||
q_nope: torch.Tensor,
|
||||
q_pe: torch.Tensor,
|
||||
kv_c_and_k_pe_cache: torch.Tensor,
|
||||
attn_metadata: CPUMLAMetadata, # type: ignore[override]
|
||||
) -> torch.Tensor:
|
||||
assert kv_c_and_k_pe_cache.numel() > 0
|
||||
|
||||
decode_meta = attn_metadata.decode_metadata
|
||||
assert decode_meta is not None
|
||||
|
||||
q = torch.cat([q_nope, q_pe], dim=-1)
|
||||
o = q.new_empty(q.shape[0], self.num_heads, self.kv_lora_rank)
|
||||
|
||||
# Run MQA
|
||||
ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale,
|
||||
decode_meta.block_tables,
|
||||
decode_meta.seq_lens_tensor)
|
||||
return self._v_up_proj(o)
|
||||
1530  vllm/attention/backends/dual_chunk_flash_attn.py  (new file; diff suppressed because it is too large)
1084  vllm/attention/backends/flash_attn.py  (new file; diff suppressed because it is too large)
1109  vllm/attention/backends/flashinfer.py  (new file; diff suppressed because it is too large)
249   vllm/attention/backends/flashmla.py  (new file)
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.mla.common import (MLACommonBackend,
|
||||
MLACommonImpl,
|
||||
MLACommonMetadata,
|
||||
MLACommonMetadataBuilder,
|
||||
MLACommonState)
|
||||
from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
|
||||
get_mla_metadata,
|
||||
is_flashmla_supported)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
|
||||
|
||||
|
||||
class FlashMLABackend(MLACommonBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "FLASHMLA"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["FlashMLAImpl"]:
|
||||
return FlashMLAImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["FlashMLAMetadata"]:
|
||||
return FlashMLAMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]:
|
||||
return FlashMLAMetadataBuilder
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["FlashMLAState"]:
|
||||
return FlashMLAState
|
||||
|
||||
|
||||
@dataclass
|
||||
class FlashMLAMetadata(MLACommonMetadata):
|
||||
decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor,
|
||||
torch.Tensor]] = None
|
||||
decode_num_splits: Optional[torch.Tensor] = None
|
||||
|
||||
@property
|
||||
def decode_metadata(self):
|
||||
decode_metadata = super().decode_metadata
|
||||
# TODO: cache assignment?
|
||||
if decode_metadata is not None:
|
||||
decode_metadata.decode_tile_scheduler_metadata=\
|
||||
self.decode_tile_scheduler_metadata
|
||||
decode_metadata.decode_num_splits=\
|
||||
self.decode_num_splits
|
||||
return decode_metadata
|
||||
|
||||
def advance_step(self,
|
||||
model_input: "ModelInputForGPUWithSamplingMetadata",
|
||||
sampled_token_ids: Optional[torch.Tensor],
|
||||
block_size: int,
|
||||
num_seqs: int,
|
||||
num_queries: int,
|
||||
turn_prefills_into_decodes: bool = False):
|
||||
raise NotImplementedError(
|
||||
"advance_step is not implemented for FlashMLA")
|
||||
|
||||
|
||||
class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self.num_q_heads = self.runner.model_config.get_num_attention_heads(
|
||||
self.runner.parallel_config)
|
||||
|
||||
def build(self, seq_lens: List[int], query_lens: List[int],
|
||||
cuda_graph_pad_size: int, batch_size: int):
|
||||
m = super().build(seq_lens, query_lens, cuda_graph_pad_size,
|
||||
batch_size)
|
||||
|
||||
if m.num_decode_tokens > 0:
|
||||
m.decode_tile_scheduler_metadata, m.decode_num_splits = \
|
||||
get_mla_metadata(
|
||||
m.seq_lens_tensor[m.num_prefills:],
|
||||
self.num_q_heads,
|
||||
1, # MQA for the decode path
|
||||
)
|
||||
|
||||
return m
|
||||
|
||||
|
||||
class FlashMLAState(MLACommonState[FlashMLAMetadata]):
|
||||
|
||||
def __init__(self, *args, **kwds):
|
||||
super().__init__(*args, **kwds)
|
||||
|
||||
self.num_q_heads = self.runner.model_config.get_num_attention_heads(
|
||||
self.runner.parallel_config)
|
||||
|
||||
@contextmanager
|
||||
def graph_capture(self, max_batch_size: int):
|
||||
# Run a dummy `get_mla_metadata` so we can get the right shapes
|
||||
self._graph_decoder_tile_scheduler_metadata, \
|
||||
self._graph_decode_num_splits = get_mla_metadata(
|
||||
torch.ones(
|
||||
max_batch_size, dtype=torch.int32, device=self.runner.device),
|
||||
self.num_q_heads,
|
||||
1, # MQA for the decode path
|
||||
)
|
||||
|
||||
with super().graph_capture(max_batch_size):
|
||||
yield
|
||||
|
||||
del self._graph_decoder_tile_scheduler_metadata
|
||||
del self._graph_decode_num_splits
|
||||
|
||||
def graph_capture_get_metadata_for_batch(
|
||||
self, batch_size: int, is_encoder_decoder_model: bool = False):
|
||||
metadata = super().graph_capture_get_metadata_for_batch(
|
||||
batch_size, is_encoder_decoder_model)
|
||||
assert metadata.num_decode_tokens > 0
|
||||
|
||||
decoder_tile_scheduler_metadata, decode_num_splits = get_mla_metadata(
|
||||
self._graph_seq_lens[:batch_size],
|
||||
self.num_q_heads,
|
||||
1, # MQA for the decode path
|
||||
)
|
||||
|
||||
self._graph_decoder_tile_scheduler_metadata.copy_(
|
||||
decoder_tile_scheduler_metadata)
|
||||
self._graph_decode_num_splits[:batch_size + 1].copy_(decode_num_splits)
|
||||
|
||||
metadata.decode_tile_scheduler_metadata=\
|
||||
self._graph_decoder_tile_scheduler_metadata
|
||||
metadata.decode_num_splits=\
|
||||
self._graph_decode_num_splits[:batch_size + 1]
|
||||
|
||||
return metadata
|
||||
|
||||
def get_graph_input_buffers(self,
|
||||
attn_metadata,
|
||||
is_encoder_decoder_model: bool = False):
|
||||
input_buffers = super().get_graph_input_buffers(
|
||||
attn_metadata, is_encoder_decoder_model)
|
||||
input_buffers["decode_tile_scheduler_metadata"] = \
|
||||
attn_metadata.decode_metadata.decode_tile_scheduler_metadata
|
||||
input_buffers["decode_num_splits"] = \
|
||||
attn_metadata.decode_metadata.decode_num_splits
|
||||
|
||||
return input_buffers
|
||||
|
||||
def prepare_graph_input_buffers(self,
|
||||
input_buffers,
|
||||
attn_metadata,
|
||||
is_encoder_decoder_model: bool = False):
|
||||
super().prepare_graph_input_buffers(input_buffers, attn_metadata,
|
||||
is_encoder_decoder_model)
|
||||
|
||||
input_buffers["decode_tile_scheduler_metadata"].copy_(
|
||||
attn_metadata.decode_metadata.decode_tile_scheduler_metadata)
|
||||
input_buffers["decode_num_splits"].copy_(
|
||||
attn_metadata.decode_metadata.decode_num_splits)
|
||||
|
||||
|
||||
class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]],
|
||||
logits_soft_cap: Optional[float],
|
||||
attn_type: str,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
# MLA Specific Arguments
|
||||
**mla_args) -> None:
|
||||
super().__init__(num_heads, head_size, scale, num_kv_heads,
|
||||
alibi_slopes, sliding_window, kv_cache_dtype,
|
||||
blocksparse_params, logits_soft_cap, attn_type,
|
||||
kv_sharing_target_layer_name, **mla_args)
|
||||
|
||||
assert is_flashmla_supported(), \
|
||||
"FlashMLA is not supported on this device"
|
||||
|
||||
unsupported_features = [
|
||||
alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
|
||||
]
|
||||
if any(unsupported_features):
|
||||
raise NotImplementedError(
|
||||
"FlashMLAImpl does not support one of the following: "
|
||||
"alibi_slopes, sliding_window, blocksparse_params, "
|
||||
"logits_soft_cap")
|
||||
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"FlashMLAImpl")
|
||||
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype):
|
||||
if self.kv_cache_dtype != "fp8":
|
||||
raise NotImplementedError(
|
||||
"FlashMLA with other KV cache not yet supported")
|
||||
|
||||
def _forward_decode(
|
||||
self,
|
||||
q_nope: torch.Tensor,
|
||||
q_pe: torch.Tensor,
|
||||
kv_c_and_k_pe_cache: torch.Tensor,
|
||||
attn_metadata: FlashMLAMetadata,
|
||||
k_scale = None,
|
||||
kv_cache_dtype = "auto",
|
||||
) -> torch.Tensor:
|
||||
assert kv_c_and_k_pe_cache.numel() > 0
|
||||
|
||||
decode_meta = attn_metadata.decode_metadata
|
||||
assert decode_meta is not None
|
||||
|
||||
q = torch.cat([q_nope, q_pe], dim=-1)\
|
||||
.unsqueeze(1) # Add seqlen dim of 1 (decode)
|
||||
|
||||
o, _ = flash_mla_with_kvcache(
|
||||
q=q,
|
||||
k_cache=kv_c_and_k_pe_cache.unsqueeze(-2), # Add head dim of 1
|
||||
block_table=decode_meta.block_tables,
|
||||
cache_seqlens=decode_meta.seq_lens_tensor,
|
||||
head_dim_v=self.kv_lora_rank,
|
||||
tile_scheduler_metadata=decode_meta.decode_tile_scheduler_metadata,
|
||||
num_splits=decode_meta.decode_num_splits,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
k_scale = k_scale,
|
||||
kv_cache_dtype = kv_cache_dtype,
|
||||
)
|
||||
|
||||
return self._v_up_proj(o)
|
||||
318  vllm/attention/backends/hpu_attn.py  (new file)
@@ -0,0 +1,318 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
###############################################################################
|
||||
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
|
||||
###############################################################################
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import vllm_hpu_extension.kernels as kernels
|
||||
import vllm_hpu_extension.ops as ops
|
||||
from vllm_hpu_extension.flags import enabled_flags
|
||||
from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionLayer,
|
||||
AttentionMetadata, AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
|
||||
HPUPagedAttentionMetadata)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class HPUAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "HPU_ATTN"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["HPUAttentionImpl"]:
|
||||
return HPUAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["AttentionMetadata"]:
|
||||
return HPUAttentionMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size,
|
||||
num_kv_heads, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dsts: torch.Tensor,
|
||||
) -> None:
|
||||
HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dsts: torch.Tensor,
|
||||
) -> None:
|
||||
HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts)
|
||||
|
||||
|
||||
@dataclass
|
||||
class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
|
||||
"""Metadata for HPUAttentionbackend."""
|
||||
# Currently, input sequences can only contain all prompts
|
||||
# or all decoding. True if all sequences are prompts.
|
||||
is_prompt: bool
|
||||
attn_bias: Optional[torch.Tensor]
|
||||
seq_lens_tensor: Optional[torch.Tensor]
|
||||
context_lens_tensor: Optional[torch.Tensor]
|
||||
|
||||
|
||||
class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
|
||||
"""
|
||||
If the input tensors contain prompt tokens, the layout is as follows:
|
||||
|<--------------- num_prefill_tokens ----------------->|
|
||||
|<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|
|
||||
|
||||
Otherwise, the layout is as follows:
|
||||
|<----------------- num_decode_tokens ------------------>|
|
||||
|<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|
|
||||
|
||||
Generation tokens can contain padding when cuda-graph is used.
|
||||
Currently, prompt tokens don't contain any padding.
|
||||
|
||||
The prompts might have different lengths, while the generation tokens
|
||||
always have length 1.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
max_seq_len: int = 4096,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super(AttentionImpl, self).__init__()
|
||||
if kv_sharing_target_layer_name is not None:
|
||||
raise NotImplementedError("KV sharing is not supported in V0.")
|
||||
if use_irope:
|
||||
logger.warning_once(
|
||||
"Using irope in HPU is not supported yet, it will fall back "
|
||||
"to global attention for long context.")
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
self.matmul_qk = Matmul()
|
||||
self.softmax = Softmax()
|
||||
self.matmul_av = Matmul()
|
||||
self.batch2block_matmul = Matmul()
|
||||
self.block2batch_matmul = Matmul()
|
||||
self.k_cache = VLLMKVCache()
|
||||
self.v_cache = VLLMKVCache()
|
||||
self.fused_scaled_dot_product_attention = kernels.fsdpa()
|
||||
|
||||
self.prefill_impl = 'naive'
|
||||
if "flex_attention" in enabled_flags():
|
||||
self.prefill_impl = 'flex'
|
||||
if "fsdpa" in enabled_flags():
|
||||
assert alibi_slopes is None, \
|
||||
'Prefill with FusedSDPA not supported with alibi slopes!'
|
||||
self.prefill_impl = 'fsdpa'
|
||||
|
||||
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
|
||||
self.sliding_window = sliding_window
|
||||
self.alibi_slopes = alibi_slopes
|
||||
if alibi_slopes is not None:
|
||||
alibi_slopes_tensor = torch.tensor(alibi_slopes,
|
||||
dtype=torch.bfloat16)
|
||||
self.alibi_slopes = alibi_slopes_tensor
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
|
||||
if self.prefill_impl == 'fsdpa':
|
||||
assert alibi_slopes is None, \
|
||||
'Prefill with FusedSDPA not supported with alibi slopes!'
|
||||
|
||||
supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
|
||||
if head_size not in supported_head_sizes:
|
||||
raise ValueError(
|
||||
f"Head size {head_size} is not supported by PagedAttention. "
|
||||
f"Supported head sizes are: {supported_head_sizes}.")
|
||||
|
||||
self.attn_type = attn_type
|
||||
if self.attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"HPUAttentionImpl")
|
||||
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype):
|
||||
raise NotImplementedError(
|
||||
"HPUAttention with FP8 KV cache not yet supported")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: HPUAttentionMetadata,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Forward pass with xFormers and PagedAttention.
|
||||
|
||||
Args:
|
||||
query: shape = [num_tokens, num_heads * head_size]
|
||||
key: shape = [num_tokens, num_kv_heads * head_size]
|
||||
value: shape = [num_tokens, num_kv_heads * head_size]
|
||||
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
|
||||
attn_metadata: Metadata for attention.
|
||||
Returns:
|
||||
shape = [num_tokens, num_heads * head_size]
|
||||
"""
|
||||
if output_scale is not None:
|
||||
raise NotImplementedError(
|
||||
"fused output quantization is not yet supported"
|
||||
" for HPUAttentionImpl")
|
||||
|
||||
batch_size, seq_len, hidden_size = query.shape
|
||||
_, seq_len_kv, _ = key.shape
|
||||
|
||||
key = key.view(-1, self.num_kv_heads, self.head_size)
|
||||
value = value.view(-1, self.num_kv_heads, self.head_size)
|
||||
block_indices = attn_metadata.block_indices
|
||||
block_offsets = attn_metadata.block_offsets
|
||||
key_cache = None
|
||||
value_cache = None
|
||||
if attn_metadata.is_prompt and self.attn_type \
|
||||
is not AttentionType.ENCODER_ONLY:
|
||||
key = key.unflatten(0, (block_indices.size(0), -1))
|
||||
value = value.unflatten(0, (block_indices.size(0), -1))
|
||||
if kv_cache is not None and isinstance(kv_cache, tuple):
|
||||
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
|
||||
kv_cache, self.num_kv_heads, self.head_size)
|
||||
|
||||
# Reshape the input keys and values and store them in the cache.
|
||||
# If kv_cache is not provided, the new key and value tensors are
|
||||
# not cached. This happens during the initial memory profiling run.
|
||||
key_cache = self.k_cache(key, key_cache, block_indices,
|
||||
block_offsets)
|
||||
value_cache = self.v_cache(value, value_cache, block_indices,
|
||||
block_offsets)
|
||||
|
||||
if attn_metadata.is_prompt:
|
||||
# Prompt run.
|
||||
query_shape = (batch_size, seq_len, self.num_heads, self.head_size)
|
||||
kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
|
||||
self.head_size)
|
||||
|
||||
attn_bias = attn_metadata.attn_bias
|
||||
if attn_bias is not None and self.alibi_slopes is not None:
|
||||
position_bias = _make_alibi_bias(self.alibi_slopes,
|
||||
self.num_kv_heads,
|
||||
attn_bias.dtype,
|
||||
attn_bias.shape[-1])
|
||||
attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1))
|
||||
attn_bias.add_(position_bias)
|
||||
|
||||
block_list = attn_metadata.block_list if attn_metadata \
|
||||
and attn_metadata.block_list is not None else None
|
||||
|
||||
out = ops.prompt_attention(
|
||||
impl=self.prefill_impl,
|
||||
query=query.view(query_shape),
|
||||
key=key.view(kv_shape),
|
||||
value=value.view(kv_shape),
|
||||
is_causal=True,
|
||||
attn_bias=attn_bias,
|
||||
valid_seq_lengths=attn_metadata.seq_lens_tensor,
|
||||
**self.common_attention_args(block_list, key_cache,
|
||||
value_cache))
|
||||
output = out.reshape(batch_size, seq_len, hidden_size)
|
||||
else:
|
||||
# Decoding run.
|
||||
output = HPUPagedAttention.forward_decode(
|
||||
query=query,
|
||||
block_mapping=attn_metadata.block_mapping,
|
||||
block_bias=attn_metadata.attn_bias,
|
||||
block_groups=attn_metadata.block_groups,
|
||||
**self.common_attention_args(attn_metadata.block_list,
|
||||
key_cache, value_cache))
|
||||
# Reshape the output tensor.
|
||||
return output.view(batch_size, seq_len, hidden_size)
|
||||
|
||||
def common_attention_args(self,
|
||||
block_list=None,
|
||||
key_cache=None,
|
||||
value_cache=None):
|
||||
fsdpa_op = self.fused_scaled_dot_product_attention.apply \
|
||||
if self.fused_scaled_dot_product_attention is not None else None
|
||||
return {
|
||||
'scale': self.scale,
|
||||
'matmul_qk_op': self.matmul_qk,
|
||||
'matmul_av_op': self.matmul_av,
|
||||
'batch2block_matmul_op': self.batch2block_matmul,
|
||||
'block2batch_matmul_op': self.block2batch_matmul,
|
||||
'fsdpa_op': fsdpa_op,
|
||||
'keys_fetch_func': self.k_cache.fetch_from_cache,
|
||||
'values_fetch_func': self.v_cache.fetch_from_cache,
|
||||
'softmax_op': self.softmax,
|
||||
'block_list': block_list,
|
||||
'key_cache': key_cache,
|
||||
'value_cache': value_cache,
|
||||
}
|
||||
|
||||
|
||||
def _make_alibi_bias(
|
||||
alibi_slopes: torch.Tensor,
|
||||
num_kv_heads: int,
|
||||
dtype: torch.dtype,
|
||||
seq_len: int,
|
||||
) -> torch.Tensor:
|
||||
bias = torch.arange(seq_len, dtype=dtype)
|
||||
# NOTE(zhuohan): HF uses
|
||||
# `bias = bias[None, :].repeat(seq_len, 1)`
|
||||
# here. We find that both biases give the same results, but
|
||||
# the bias below more accurately follows the original ALiBi
|
||||
# paper.
|
||||
# Calculate a matrix where each element represents ith element- jth
|
||||
# element.
|
||||
bias = bias[None, :] - bias[:, None]
|
||||
|
||||
padded_len = (seq_len + 7) // 8 * 8
|
||||
num_heads = alibi_slopes.shape[0]
|
||||
bias = torch.empty(
|
||||
1, # batch size
|
||||
num_heads,
|
||||
seq_len,
|
||||
padded_len,
|
||||
device=alibi_slopes.device,
|
||||
dtype=dtype,
|
||||
)[:, :, :, :seq_len].copy_(bias)
|
||||
bias.mul_(alibi_slopes[:, None, None])
|
||||
if num_heads != num_kv_heads:
|
||||
bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
|
||||
return bias
|
||||
403  vllm/attention/backends/ipex_attn.py  (new file)
@@ -0,0 +1,403 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
""" Attention layer with torch scaled_dot_product_attention
|
||||
and PagedAttention."""
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
from vllm._ipex_ops import ipex_ops
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionLayer,
|
||||
AttentionMetadata, AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.attention.ops.paged_attn import (PagedAttention,
|
||||
PagedAttentionMetadata)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
_PARTITION_SIZE = 512
|
||||
|
||||
|
||||
class IpexAttnBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "IPEX"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["IpexAttnBackendImpl"]:
|
||||
return IpexAttnBackendImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["IpexAttnMetadata"]:
|
||||
return IpexAttnMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
|
||||
num_kv_heads, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: torch.Tensor,
|
||||
) -> None:
|
||||
from vllm._ipex_ops import ipex_ops as ops
|
||||
ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dists: torch.Tensor,
|
||||
) -> None:
|
||||
from vllm._ipex_ops import ipex_ops as ops
|
||||
key_caches = [kv_cache[0] for kv_cache in kv_caches]
|
||||
value_caches = [kv_cache[1] for kv_cache in kv_caches]
|
||||
ops.copy_blocks(key_caches, value_caches, src_to_dists)
|
||||
|
||||
|
||||
@dataclass
|
||||
class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata):
|
||||
"""Metadata for IpexAttnBackend.
|
||||
"""
|
||||
# Currently, input sequences can only contain all prompts
|
||||
# or all decoding. True if all sequences are prompts.
|
||||
is_prompt: bool
|
||||
slot_mapping: torch.Tensor
|
||||
seq_lens: Optional[List[int]]
|
||||
seqlen_q: Optional[torch.Tensor]
|
||||
max_seqlen: Optional[int]
|
||||
|
||||
def __post_init__(self):
|
||||
# Set during the execution of the first attention op.
|
||||
# It is a list because it needs to be set per prompt when alibi
# slopes are used, due to a limitation of the xFormers API.
|
||||
# will not appear in the __repr__ and __init__
|
||||
self.attn_bias: Optional[List[torch.Tensor]] = None
|
||||
|
||||
@property
|
||||
def prefill_metadata(self) -> Optional["IpexAttnMetadata"]:
|
||||
# Currently chunked prefill is not supported
|
||||
if self.num_decode_tokens == 0:
|
||||
assert self.num_prefills > 0
|
||||
return self
|
||||
|
||||
return None
|
||||
|
||||
@property
|
||||
def decode_metadata(self) -> Optional["IpexAttnMetadata"]:
|
||||
# Currently chunked prefill is not supported
|
||||
if self.num_prefills > 0:
|
||||
assert self.num_decode_tokens == 0
|
||||
return None
|
||||
|
||||
return self
|
||||
|
||||
|
||||
class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
if kv_sharing_target_layer_name is not None:
|
||||
raise NotImplementedError("KV sharing is not supported in V0.")
|
||||
if use_irope:
|
||||
logger.warning_once(
|
||||
"Using irope in Ipex is not supported yet, it will fall"
|
||||
" back to global attention for long context.")
|
||||
if blocksparse_params is not None:
|
||||
raise ValueError(
|
||||
"IPEX backend does not support block-sparse attention.")
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
self.num_kv_heads = num_kv_heads
|
||||
if alibi_slopes is not None:
|
||||
alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
|
||||
self.alibi_slopes = alibi_slopes
|
||||
self.sliding_window = sliding_window
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
self.need_mask = (self.sliding_window is not None)
|
||||
if logits_soft_cap is None:
|
||||
logits_soft_cap = -1
|
||||
self.logits_soft_cap = logits_soft_cap
|
||||
|
||||
supported_head_sizes = PagedAttention.get_supported_head_sizes()
|
||||
if head_size not in supported_head_sizes:
|
||||
raise ValueError(
|
||||
f"Head size {head_size} is not supported by PagedAttention. "
|
||||
f"Supported head sizes are: {supported_head_sizes}.")
|
||||
if is_quantized_kv_cache(kv_cache_dtype):
|
||||
raise NotImplementedError(
|
||||
"IPEX backend does not support FP8 KV cache. "
|
||||
"Please use xFormers backend instead.")
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"IpexAttnBackendImpl")
|
||||
|
||||
def split_kv_cache(
|
||||
self,
|
||||
kv_cache: torch.Tensor,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
x = 1
|
||||
num_blocks = kv_cache.shape[1]
|
||||
|
||||
key_cache = kv_cache[0]
|
||||
key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
|
||||
-1, x)
|
||||
value_cache = kv_cache[1]
|
||||
value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
|
||||
return key_cache, value_cache
|
||||
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: IpexAttnMetadata, # type: ignore
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Forward pass with IPEX varlen_attention and PagedAttention.
|
||||
|
||||
Args:
|
||||
query: shape = [num_tokens, num_heads * head_size]
|
||||
key: shape = [num_tokens, num_kv_heads * head_size]
|
||||
value: shape = [num_tokens, num_kv_heads * head_size]
|
||||
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
|
||||
NOTE: kv_cache will be an empty tensor with shape [0]
|
||||
for profiling run.
|
||||
attn_metadata: Metadata for attention.
|
||||
Returns:
|
||||
shape = [num_tokens, num_heads * head_size]
|
||||
"""
|
||||
if output_scale is not None:
|
||||
raise NotImplementedError(
|
||||
"fused output quantization is not yet supported"
|
||||
" for IpexAttentionImpl")
|
||||
|
||||
assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
|
||||
num_tokens, hidden_size = query.shape
|
||||
# Reshape the query, key, and value tensors.
|
||||
query = query.view(-1, self.num_heads, self.head_size)
|
||||
key = key.view(-1, self.num_kv_heads, self.head_size)
|
||||
value = value.view(-1, self.num_kv_heads, self.head_size)
|
||||
|
||||
if kv_cache.numel() > 0:
|
||||
key_cache, value_cache = self.split_kv_cache(
|
||||
kv_cache, self.num_kv_heads, self.head_size)
|
||||
ipex_ops.reshape_and_cache(
|
||||
key,
|
||||
value,
|
||||
key_cache,
|
||||
value_cache,
|
||||
attn_metadata.slot_mapping.flatten(),
|
||||
self.kv_cache_dtype,
|
||||
layer._k_scale_float,
|
||||
layer._v_scale_float,
|
||||
)
|
||||
|
||||
if attn_metadata.is_prompt:
|
||||
assert attn_metadata.seq_lens is not None
|
||||
if (kv_cache.numel() == 0
|
||||
or attn_metadata.block_tables.numel() == 0):
|
||||
if self.num_kv_heads != self.num_heads:
|
||||
key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
|
||||
value = value.repeat_interleave(self.num_queries_per_kv,
|
||||
dim=1)
|
||||
|
||||
if attn_metadata.attn_bias is None:
|
||||
if self.sliding_window is not None:
|
||||
att_masks = _make_sliding_window_bias(
|
||||
attn_metadata.seq_lens, self.sliding_window,
|
||||
query.dtype) # type: ignore
|
||||
else:
|
||||
att_masks = _make_sliding_window_bias(
|
||||
attn_metadata.seq_lens, None, dtype=query.dtype)
|
||||
attn_metadata.attn_bias = att_masks
|
||||
|
||||
output = torch.empty(
|
||||
(num_tokens, self.num_heads, self.head_size),
|
||||
dtype=query.dtype,
|
||||
device=query.device)
|
||||
ipex_ops.varlen_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
output,
|
||||
attn_metadata.seqlen_q,
|
||||
attn_metadata.seqlen_q,
|
||||
self.alibi_slopes,
|
||||
attn_metadata.max_seqlen,
|
||||
attn_metadata.max_seqlen,
|
||||
pdropout=0.0,
|
||||
softmax_scale=self.scale,
|
||||
zero_tensors=False,
|
||||
is_causal=True,
|
||||
return_softmax=False,
|
||||
gen_=None,
|
||||
window_size_left=-1,
|
||||
window_size_right=-1,
|
||||
logits_soft_cap=self.logits_soft_cap,
|
||||
)
|
||||
else:
|
||||
# prefix-enabled attention
|
||||
raise RuntimeError(
|
||||
"IPEX backend doesn't support prefix decoding.")
|
||||
|
||||
else:
|
||||
# Decoding run.
|
||||
max_seq_len = attn_metadata.max_decode_seq_len
|
||||
output = torch.empty_like(query)
|
||||
block_size = value_cache.shape[3]
|
||||
num_seqs, num_heads, head_size = query.shape
|
||||
max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
|
||||
_PARTITION_SIZE)
|
||||
# NOTE(woosuk): We use a simple heuristic to decide whether to use
|
||||
# PagedAttention V1 or V2. If the number of partitions is 1, we use
|
||||
# V1 to avoid the overhead of reduction. Also, if the number of
|
||||
# sequences or heads is large, we use V1 since there is enough work
|
||||
# to parallelize.
|
||||
# TODO(woosuk): Tune this heuristic.
|
||||
# For context len > 8192, use V2 kernel to avoid shared memory
|
||||
# shortage.
|
||||
use_v1 = (max_seq_len <= 8192 and
|
||||
(max_num_partitions == 1 or num_seqs * num_heads > 512))
|
||||
if use_v1:
|
||||
# Run PagedAttention V1.
|
||||
ipex_ops.paged_attention_v1(
|
||||
output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
self.num_kv_heads,
|
||||
self.scale,
|
||||
attn_metadata.block_tables,
|
||||
attn_metadata.seq_lens_tensor,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
self.alibi_slopes,
|
||||
self.kv_cache_dtype,
|
||||
layer._k_scale_float,
|
||||
layer._v_scale_float,
|
||||
)
|
||||
else:
|
||||
# Run PagedAttention V2.
|
||||
assert _PARTITION_SIZE % block_size == 0
|
||||
tmp_output = torch.empty(
|
||||
size=(num_seqs, num_heads, max_num_partitions, head_size),
|
||||
dtype=output.dtype,
|
||||
device=output.device,
|
||||
)
|
||||
exp_sums = torch.empty(
|
||||
size=(num_seqs, num_heads, max_num_partitions),
|
||||
dtype=torch.float32,
|
||||
device=output.device,
|
||||
)
|
||||
max_logits = torch.empty_like(exp_sums)
|
||||
ipex_ops.paged_attention_v2(
|
||||
output,
|
||||
exp_sums,
|
||||
max_logits,
|
||||
tmp_output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
self.num_kv_heads,
|
||||
self.scale,
|
||||
attn_metadata.block_tables,
|
||||
attn_metadata.seq_lens_tensor,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
self.alibi_slopes,
|
||||
self.kv_cache_dtype,
|
||||
layer._k_scale_float,
|
||||
layer._v_scale_float,
|
||||
)
|
||||
|
||||
# Reshape the output tensor.
|
||||
return output.view(-1, self.num_heads * self.head_size)
|
||||
|
||||
|
||||
def _make_alibi_bias(
|
||||
alibi_slopes: torch.Tensor,
|
||||
dtype: torch.dtype,
|
||||
seq_lens: List[int],
|
||||
) -> List[torch.Tensor]:
|
||||
attn_biases = []
|
||||
for seq_len in seq_lens:
|
||||
bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
|
||||
# NOTE(zhuohan): HF uses
|
||||
# `bias = bias[None, :].repeat(seq_len, 1)`
|
||||
# here. We find that both biases give the same results, but
|
||||
# the bias below more accurately follows the original ALiBi
|
||||
# paper.
|
||||
bias = bias[None, :] - bias[:, None]
|
||||
|
||||
num_heads = alibi_slopes.shape[0]
|
||||
bias = bias[None, :].repeat((num_heads, 1, 1))
|
||||
bias.mul_(alibi_slopes[:, None, None])
|
||||
inf_mask = torch.empty(
|
||||
(1, seq_len, seq_len),
|
||||
dtype=bias.dtype,
|
||||
device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1)
|
||||
attn_biases.append((bias + inf_mask).to(dtype))
|
||||
|
||||
return attn_biases
|
||||
|
||||
|
||||
def _make_sliding_window_bias(
|
||||
seq_lens: List[int],
|
||||
window_size: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
) -> List[torch.Tensor]:
|
||||
attn_biases = []
|
||||
for seq_len in seq_lens:
|
||||
tensor = torch.full(
|
||||
(1, seq_len, seq_len),
|
||||
dtype=dtype,
|
||||
fill_value=1,
|
||||
)
|
||||
shift = 0
|
||||
mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore
|
||||
if window_size is not None:
|
||||
mask = torch.triu(mask, diagonal=shift - window_size + 1)
|
||||
mask = torch.log(mask)
|
||||
attn_biases.append(mask.to(dtype))
|
||||
|
||||
return attn_biases
|
||||
0     vllm/attention/backends/mla/__init__.py  (new file)
1405  vllm/attention/backends/mla/common.py  (new file; diff suppressed because it is too large)
356   vllm/attention/backends/pallas.py  (new file)
@@ -0,0 +1,356 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import torch_xla.experimental.custom_kernel # Required to register custom ops.
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionLayer,
|
||||
AttentionMetadata, AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class PallasAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "PALLAS"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["PallasAttentionBackendImpl"]:
|
||||
return PallasAttentionBackendImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["PallasMetadata"]:
|
||||
return PallasMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return (num_kv_heads, num_blocks, block_size, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: torch.Tensor,
|
||||
) -> None:
|
||||
raise RuntimeError("swap_blocks is not used for the TPU backend.")
|
||||
|
||||
@torch.compile(backend="openxla")
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
|
||||
src_to_dists: Tuple[torch.Tensor, torch.Tensor],
|
||||
) -> None:
|
||||
src_indices, dst_indices = src_to_dists
|
||||
for k_cache, v_cache in kv_caches:
|
||||
torch.ops.xla.dynamo_set_buffer_donor_(k_cache, True)
|
||||
k_cache[:, dst_indices] = k_cache[:, src_indices]
|
||||
torch.ops.xla.dynamo_set_buffer_donor_(v_cache, True)
|
||||
v_cache[:, dst_indices] = v_cache[:, src_indices]
|
||||
|
||||
|
||||
@dataclass
|
||||
class PallasMetadata(AttentionMetadata):
|
||||
|
||||
# Currently, input sequences can only contain all prefills
|
||||
# or all decoding.
|
||||
block_tables: Optional[torch.Tensor] = None
|
||||
context_lens: Optional[torch.Tensor] = None
|
||||
effective_query_lens: Optional[torch.Tensor] = None
|
||||
|
||||
@property
|
||||
def prefill_metadata(self) -> Optional["PallasMetadata"]:
|
||||
if self.num_prefills == 0:
|
||||
return None
|
||||
|
||||
assert self.num_decode_tokens == 0
|
||||
return self
|
||||
|
||||
@property
|
||||
def decode_metadata(self) -> Optional["PallasMetadata"]:
|
||||
if self.num_decode_tokens == 0:
|
||||
return None
|
||||
|
||||
assert self.num_prefills == 0
|
||||
assert self.num_prefill_tokens == 0
|
||||
assert self.block_tables is not None
|
||||
assert self.context_lens is not None
|
||||
return self
|
||||
|
||||
|
||||
class PallasAttentionBackendImpl(AttentionImpl):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
if kv_sharing_target_layer_name is not None:
|
||||
raise NotImplementedError("KV sharing is not supported in V0.")
|
||||
if use_irope:
|
||||
logger.warning_once(
|
||||
"Using irope in Pallas is not supported yet, it will fall back "
|
||||
"to global attention for long context.")
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
self.num_kv_heads = num_kv_heads
|
||||
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
self.logits_soft_cap = logits_soft_cap
|
||||
if head_size % 128 != 0:
|
||||
raise NotImplementedError(
|
||||
f"Head size must be a multiple of 128, found {head_size}.")
|
||||
if alibi_slopes is not None:
|
||||
raise NotImplementedError("Alibi slopes is not supported.")
|
||||
if sliding_window is not None:
|
||||
raise NotImplementedError("Sliding window is not supported.")
|
||||
if is_quantized_kv_cache(kv_cache_dtype):
|
||||
raise NotImplementedError("FP8 KV cache dtype is not supported.")
|
||||
if blocksparse_params is not None:
|
||||
raise NotImplementedError("Blocksparse is not supported.")
|
||||
|
||||
if torch_xla.tpu.version() < 4:
|
||||
raise NotImplementedError("TPU version must be 4 or higher.")
|
||||
|
||||
self.megacore_mode = None
|
||||
tpu_env = torch_xla.tpu.get_tpu_env()
|
||||
tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None)
|
||||
or tpu_env.get("TYPE", None)
|
||||
or tpu_env.get("TPU_ACCELERATOR_TYPE", None))
|
||||
assert tpu_type is not None
|
||||
tpu_type = tpu_type.lower()
|
||||
|
||||
if (("lite" not in tpu_type) and ("v6" not in tpu_type)):
|
||||
if self.num_kv_heads % 2 == 0:
|
||||
self.megacore_mode = "kv_head"
|
||||
else:
|
||||
# NOTE(woosuk): If the batch size is not a multiple of 2, the
|
||||
# megacore mode will be None.
|
||||
self.megacore_mode = "batch"
|
||||
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"PallasAttentionBackendImpl")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: Tuple[torch.Tensor, torch.Tensor],
|
||||
attn_metadata: PallasMetadata,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Forward pass with Pallas attention.
|
||||
|
||||
Args:
|
||||
query: shape = [batch_size, seq_len, num_heads * head_size]
|
||||
key: shape = [batch_size, seq_len, num_kv_heads * head_size]
|
||||
value: shape = [batch_size, seq_len, num_kv_heads * head_size]
|
||||
kv_cache[0] = [num_kv_heads, num_blocks, block_size, head_size]
|
||||
kv_cache[1] = [num_kv_heads, num_blocks, block_size, head_size]
|
||||
NOTE: kv_cache[0] and kv_cache[1] will be an empty tensor
|
||||
with shape [0] for profiling run.
|
||||
attn_metadata: Metadata for attention.
|
||||
Returns:
|
||||
shape = [batch_size, seq_len, num_heads * head_size]
|
||||
"""
|
||||
if output_scale is not None:
|
||||
raise NotImplementedError(
|
||||
"fused output quantization is not yet supported"
|
||||
" for PallasAttentionImpl")
|
||||
|
||||
assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
|
||||
batch_size, seq_len, hidden_size = query.shape
|
||||
query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
|
||||
key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
|
||||
value = value.view(batch_size, seq_len, self.num_kv_heads,
|
||||
self.head_size)
|
||||
|
||||
if kv_cache[0].numel() > 0:
|
||||
slot_mapping = attn_metadata.slot_mapping
|
||||
key_cache, value_cache = kv_cache
|
||||
write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping)
|
||||
|
||||
query = query * self.scale
|
||||
if attn_metadata.num_prefills > 0:
|
||||
if attn_metadata.block_tables is None:
|
||||
# Prefill without paged KV cache.
|
||||
assert seq_len % 16 == 0, (
|
||||
"Pallas FlashAttention kernel requires seq_len to be a "
|
||||
f"multiple of 16 but got {seq_len}")
|
||||
|
||||
# Handle GQA/MQA.
|
||||
if self.num_kv_heads != self.num_heads:
|
||||
key = key.repeat_interleave(self.num_queries_per_kv,
|
||||
dim=-2)
|
||||
key = key.view(batch_size, seq_len, self.num_heads,
|
||||
self.head_size)
|
||||
value = value.repeat_interleave(self.num_queries_per_kv,
|
||||
dim=-2)
|
||||
value = value.view(batch_size, seq_len, self.num_heads,
|
||||
self.head_size)
|
||||
# FlashAttention kernel requires the input shape to be
|
||||
# [batch_size, num_heads, seq_len, d_model]
|
||||
# while the input is [batch_size, seq_len, num_heads, d_model].
|
||||
# Permute the input to match the required format.
|
||||
output = torch.ops.xla.flash_attention(
|
||||
query.permute(0, 2, 1, 3),
|
||||
key.permute(0, 2, 1, 3),
|
||||
value.permute(0, 2, 1, 3),
|
||||
True,
|
||||
)
|
||||
output = output.permute(0, 2, 1, 3)
|
||||
else:
|
||||
# Prefill with paged KV cache.
|
||||
# TODO(woosuk): Tune the below knobs.
|
||||
num_kv_pages_per_compute_block = 16
|
||||
num_queries_per_compute_block = 16
|
||||
assert seq_len % num_queries_per_compute_block == 0
|
||||
output = torch.ops.xla.multi_queries_paged_attention(
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
attn_metadata.context_lens,
|
||||
attn_metadata.block_tables,
|
||||
attn_metadata.effective_query_lens,
|
||||
num_kv_pages_per_compute_block,
|
||||
num_queries_per_compute_block,
|
||||
use_kernel=True,
|
||||
attn_logits_soft_cap=self.logits_soft_cap,
|
||||
)
|
||||
else:
|
||||
# Decoding run.
|
||||
assert kv_cache[0].numel() > 0
|
||||
query = query.squeeze(dim=1)
|
||||
pages_per_compute_block = 16 # TODO(woosuk): Tune this value.
|
||||
|
||||
assert attn_metadata.block_tables is not None
|
||||
assert attn_metadata.context_lens is not None
|
||||
# NOTE(woosuk): The PagedAttention Pallas kernel stores the entire
|
||||
# block table in SMEM. Therefore, if the block table is too large,
|
||||
# the kernel compilation will fail. To avoid this, we split the
|
||||
# batch dimension into smaller chunks and run the kernel multiple
|
||||
# times.
|
||||
MAX_SMEM_USAGE = 512 * 1024
|
||||
size_per_seq = 4 * attn_metadata.block_tables.shape[1]
|
||||
max_num_seq = MAX_SMEM_USAGE // size_per_seq
|
||||
|
||||
if batch_size <= max_num_seq:
|
||||
output = paged_attention(
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
attn_metadata.context_lens,
|
||||
attn_metadata.block_tables,
|
||||
pages_per_compute_block,
|
||||
self.megacore_mode,
|
||||
attn_logits_soft_cap=self.logits_soft_cap,
|
||||
)
|
||||
else:
|
||||
chunk_size = max_num_seq
|
||||
# Make sure the chunk size is a multiple of 2.
|
||||
chunk_size = chunk_size // 2 * 2
|
||||
num_chunks = (batch_size + chunk_size - 1) // chunk_size
|
||||
|
||||
output = torch.empty_like(query)
|
||||
for chunk_idx in range(num_chunks):
|
||||
chunk_start = chunk_idx * chunk_size
|
||||
chunk_end = chunk_start + chunk_size
|
||||
# NOTE(woosuk): We skip this line because it causes Dynamo
|
||||
# compilation error. Instead, we rely on the slice operation
|
||||
# to handle the out-of-bound case.
|
||||
# chunk_end = min(chunk_end, batch_size)
|
||||
chunk_output = paged_attention(
|
||||
query[chunk_start:chunk_end],
|
||||
key_cache,
|
||||
value_cache,
|
||||
attn_metadata.context_lens[chunk_start:chunk_end],
|
||||
attn_metadata.block_tables[chunk_start:chunk_end],
|
||||
pages_per_compute_block,
|
||||
self.megacore_mode,
|
||||
attn_logits_soft_cap=self.logits_soft_cap,
|
||||
)
|
||||
output[chunk_start:chunk_end] = chunk_output
|
||||
|
||||
# Reshape the output tensor.
|
||||
return output.reshape(batch_size, seq_len, hidden_size)
|
||||
|
||||
|
||||
def write_to_kv_cache(
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
slot_mapping: torch.Tensor,
|
||||
) -> None:
|
||||
torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True)
|
||||
torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True)
|
||||
|
||||
key = key.flatten(0, 2)
|
||||
value = value.flatten(0, 2)
|
||||
key_cache = key_cache.flatten(0, 2)
|
||||
value_cache = value_cache.flatten(0, 2)
|
||||
key_cache.index_copy_(0, slot_mapping, key)
|
||||
value_cache.index_copy_(0, slot_mapping, value)
|
||||
|
||||
|
||||
def paged_attention(
|
||||
query: torch.Tensor,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
context_lens: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
pages_per_compute_block: int,
|
||||
megacore_mode: Optional[str],
|
||||
*,
|
||||
attn_logits_soft_cap: Optional[float],
|
||||
) -> torch.Tensor:
|
||||
batch_size = query.shape[0]
|
||||
if megacore_mode == "batch" and batch_size % 2 != 0:
|
||||
megacore_mode = None
|
||||
else:
|
||||
megacore_mode = megacore_mode
|
||||
|
||||
return torch.ops.xla.paged_attention(
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
context_lens,
|
||||
block_tables,
|
||||
pages_per_compute_block,
|
||||
megacore_mode=megacore_mode,
|
||||
attn_logits_soft_cap=attn_logits_soft_cap,
|
||||
)
|
||||
400  vllm/attention/backends/placeholder_attn.py  (new file)
@@ -0,0 +1,400 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionMetadata,
|
||||
AttentionMetadataBuilder)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||
ModelInputForGPUWithSamplingMetadata)
|
||||
from vllm.utils import async_tensor_h2d
|
||||
|
||||
# Placeholder attention backend for models like Mamba and pooling models that
|
||||
# lack attention.
|
||||
|
||||
|
||||
class PlaceholderAttentionBackend(AttentionBackend):
|
||||
"""Placeholder backend for when no attention is needed."""
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "NO_ATTENTION"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["PlaceholderAttentionImpl"]:
|
||||
return PlaceholderAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]:
|
||||
return PlaceholderAttentionMetadataBuilder
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]:
|
||||
return PlaceholderAttentionMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return (1, 1, 1, 1, 1)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: torch.Tensor,
|
||||
) -> None:
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dists: torch.Tensor,
|
||||
) -> None:
|
||||
return
|
||||
|
||||
|
||||
@dataclass
|
||||
class PlaceholderAttentionMetadata(AttentionMetadata):
|
||||
"""Attention metadata for prefill and decode batched together."""
|
||||
# (batch_size,). The sequence length per sequence. Sequence length means
|
||||
# the computed tokens + new tokens. None if it is a decoding.
|
||||
seq_lens: Optional[List[int]]
|
||||
# seq_lens stored as a tensor.
|
||||
seq_lens_tensor: Optional[torch.Tensor]
|
||||
|
||||
# Maximum sequence length among prefill batch. 0 if there are decoding
|
||||
# requests only.
|
||||
max_prefill_seq_len: int
|
||||
# Maximum sequence length among decode batch. 0 if there are prefill
|
||||
# requests only.
|
||||
max_decode_seq_len: int
|
||||
# (batch_size,) A tensor of context lengths (tokens that are computed
|
||||
# so far).
|
||||
context_lens_tensor: Optional[torch.Tensor]
|
||||
|
||||
# Whether or not cuda graph is enabled.
|
||||
# Cuda-graph is currently enabled for decoding only.
|
||||
# TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
|
||||
use_cuda_graph: bool
|
||||
|
||||
# Maximum query length in the batch.
|
||||
max_query_len: Optional[int]
|
||||
|
||||
# Max number of query tokens among requests in the batch.
|
||||
max_decode_query_len: Optional[int]
|
||||
|
||||
# (batch_size + 1,). The cumulative subquery lengths of the sequences in
|
||||
# the batch, used to index into subquery. E.g., if the subquery length
|
||||
# is [4, 6], it is [0, 4, 10].
|
||||
query_start_loc: Optional[torch.Tensor] = None
|
||||
# (batch_size + 1,). The cumulative sequence lengths of the sequences in
|
||||
# the batch, used to index into sequence. E.g., if the sequence length is
|
||||
# [4, 6], it is [0, 4, 10].
|
||||
seq_start_loc: Optional[torch.Tensor] = None
|
||||
|
||||
# Placeholder.
|
||||
block_tables: Optional[torch.Tensor] = None
|
||||
|
||||
_cached_prefill_metadata: Optional["PlaceholderAttentionMetadata"] = None
|
||||
_cached_decode_metadata: Optional["PlaceholderAttentionMetadata"] = None
|
||||
|
||||
@property
|
||||
def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
|
||||
if self.num_prefills == 0:
|
||||
return None
|
||||
|
||||
if self._cached_prefill_metadata is not None:
|
||||
return self._cached_prefill_metadata
|
||||
|
||||
# Compute some attn_metadata fields which default to None
|
||||
query_start_loc = (None if self.query_start_loc is None else
|
||||
self.query_start_loc[:self.num_prefills + 1])
|
||||
seq_lens = (None if self.seq_lens is None else
|
||||
self.seq_lens[:self.num_prefills])
|
||||
seq_lens_tensor = (None if self.seq_lens_tensor is None else
|
||||
self.seq_lens_tensor[:self.num_prefills])
|
||||
seq_start_loc = (None if self.seq_start_loc is None else
|
||||
self.seq_start_loc[:self.num_prefills + 1])
|
||||
context_lens_tensor = (None if self.context_lens_tensor is None else
|
||||
self.context_lens_tensor[:self.num_prefills])
|
||||
|
||||
# Placeholders
|
||||
slot_mapping = torch.empty(0)
|
||||
block_tables = torch.empty(0)
|
||||
|
||||
self._cached_prefill_metadata = PlaceholderAttentionMetadata(
|
||||
num_prefills=self.num_prefills,
|
||||
num_prefill_tokens=self.num_prefill_tokens,
|
||||
num_decode_tokens=0,
|
||||
slot_mapping=slot_mapping,
|
||||
multi_modal_placeholder_index_maps=self.
|
||||
multi_modal_placeholder_index_maps,
|
||||
enable_kv_scales_calculation=self.enable_kv_scales_calculation,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_tensor=seq_lens_tensor,
|
||||
max_decode_query_len=0,
|
||||
max_query_len=self.max_query_len,
|
||||
max_prefill_seq_len=self.max_prefill_seq_len,
|
||||
max_decode_seq_len=0,
|
||||
query_start_loc=query_start_loc,
|
||||
seq_start_loc=seq_start_loc,
|
||||
context_lens_tensor=context_lens_tensor,
|
||||
block_tables=block_tables,
|
||||
use_cuda_graph=False,
|
||||
)
|
||||
return self._cached_prefill_metadata
|
||||
|
||||
    @property
    def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
        if self.num_decode_tokens == 0:
            return None

        if self._cached_decode_metadata is not None:
            return self._cached_decode_metadata
        assert self.seq_lens_tensor is not None

        # Placeholders
        slot_mapping = torch.empty(0)
        block_tables = torch.empty(0)
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[self.num_prefills:])

        self._cached_decode_metadata = PlaceholderAttentionMetadata(
            num_prefills=0,
            num_prefill_tokens=0,
            num_decode_tokens=self.num_decode_tokens,
            slot_mapping=slot_mapping,
            multi_modal_placeholder_index_maps=None,
            enable_kv_scales_calculation=True,
            seq_lens=None,
            seq_lens_tensor=seq_lens_tensor,
            max_decode_query_len=self.max_decode_query_len,
            max_query_len=None,
            max_prefill_seq_len=0,
            max_decode_seq_len=self.max_decode_seq_len,
            query_start_loc=(self.query_start_loc[self.num_prefills:] -
                             self.query_start_loc[self.num_prefills])
            if self.query_start_loc is not None else None,
            seq_start_loc=self.seq_start_loc[self.num_prefills:]
            if self.seq_start_loc is not None else None,
            context_lens_tensor=None,
            block_tables=block_tables,
            use_cuda_graph=self.use_cuda_graph,
        )
        return self._cached_decode_metadata
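    # Worked example (illustrative only): continuing the case above,
    # decode_metadata rebases the decode offsets:
    #     query_start_loc[2:] - query_start_loc[2] == [0, 1, 2]
    # so the first decode query starts at position 0 of the decode batch.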
    def advance_step(self,
                     model_input: "ModelInputForGPUWithSamplingMetadata",
                     sampled_token_ids: Optional[torch.Tensor],
                     block_size: int,
                     num_seqs: int,
                     num_queries: int,
                     turn_prefills_into_decodes: bool = False):
        """
        Update metadata in-place to advance one decode step.
        """
        # When using cudagraph, num_seqs is padded to the next captured
        # batch size, but num_queries tracks the actual number of requests in
        # the batch. For --enforce-eager mode, num_seqs == num_queries.
        if num_seqs != num_queries:
            assert num_seqs > num_queries
            assert self.use_cuda_graph

        assert not turn_prefills_into_decodes, \
            ("Multi-Step + Chunked-Prefill is not supported for "
             "attention-free models. turn_prefills_into_decodes is a "
             "Multi-Step + Chunked-Prefill specific parameter.")

        assert self.seq_lens is not None
        assert self.max_decode_seq_len == max(self.seq_lens)

        assert self.num_prefills == 0
        assert self.num_prefill_tokens == 0
        assert self.num_decode_tokens == num_seqs

        assert self.seq_lens is not None
        assert len(self.seq_lens) == num_seqs
        assert self.seq_lens_tensor is not None
        assert self.seq_lens_tensor.shape == (num_seqs, )
        assert self.max_query_len == 1
        assert self.max_prefill_seq_len == 0

        assert self.query_start_loc is not None
        assert self.query_start_loc.shape == (num_queries + 1, )
        assert self.seq_start_loc is not None
        assert self.seq_start_loc.shape == (num_seqs + 1, )

        assert self.context_lens_tensor is not None
        assert self.context_lens_tensor.shape == (num_queries, )

        # Update query lengths. Note that we update only queries and not seqs,
        # since tensors may be padded due to the captured cuda graph batch size.
        for i in range(num_queries):
            self.seq_lens[i] += 1
        self.max_decode_seq_len = max(self.seq_lens)

        # Update sequences, masking off entries greater than num_queries.
        device = self.seq_lens_tensor.device
        mask = torch.arange(self.seq_lens_tensor.size(0),
                            device=device) < num_queries
        self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype)
        if sampled_token_ids is not None:
            model_input.input_tokens.masked_scatter_(
                mask, sampled_token_ids[:num_queries])
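    # Example (illustrative only): with num_queries == 2 and num_seqs == 4
    # (padded to a captured CUDA graph batch size), mask is
    # [True, True, False, False], so only the first two entries of
    # seq_lens_tensor are incremented and only those positions of
    # model_input.input_tokens receive the newly sampled tokens.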
class PlaceholderAttentionMetadataBuilder(
        AttentionMetadataBuilder[PlaceholderAttentionMetadata]):

    def __init__(self, input_builder: "ModelInputForGPUBuilder"):

        self.input_builder = input_builder
        self.runner = input_builder.runner

    def prepare(self):
        self.prefill_seq_lens: List[int] = []
        self.context_lens: List[int] = []
        self.curr_seq_lens: List[int] = []
        self.multimodal_placeholder_maps: Dict[
            str,
            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
        self.num_prefills = 0
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0
    def _add_seq_group(
            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
            chunked_prefill_enabled: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        """
        is_prompt = inter_data.is_prompt

        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
             curr_sliding_window_block) in zip(
                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
                 inter_data.orig_seq_lens, inter_data.seq_lens,
                 inter_data.query_lens, inter_data.context_lens,
                 inter_data.curr_sliding_window_blocks):
            self.context_lens.append(context_len)

            if is_prompt:
                mm_maps = inter_data.multi_modal_placeholder_maps
                if mm_maps:
                    for modality, placeholders in mm_maps.items():
                        self.multimodal_placeholder_maps[modality].extend(
                            placeholders)

                self.num_prefills += 1
                self.num_prefill_tokens += token_len
                self.prefill_seq_lens.append(seq_len)
            else:
                self.num_decode_tokens += query_len
                self.curr_seq_lens.append(curr_seq_len)
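    # Example (illustrative only): two prompt sequences with 4 and 6 input
    # tokens plus one decode sequence with query_len == 1 give
    # num_prefills == 2, num_prefill_tokens == 10 and num_decode_tokens == 1.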
    def build(self, seq_lens: List[int], query_lens: List[int],
              cuda_graph_pad_size: int, batch_size: int):
        """Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.
        """

        # Some input builders such as ModelInputForCPUBuilder do not have the
        # "inter_data_list" attribute.
        # Let's check inter_data_list exists before we reference it.
        if hasattr(self.input_builder, "inter_data_list"):
            for inter_data in self.input_builder.inter_data_list:
                self._add_seq_group(inter_data,
                                    self.input_builder.chunked_prefill_enabled)

        device = self.runner.device
        use_captured_graph = cuda_graph_pad_size != -1

        max_query_len = max(query_lens)
        decode_query_lens = query_lens[self.num_prefills:]
        if len(decode_query_lens) > 0:
            max_decode_query_len = max(decode_query_lens)
        else:
            max_decode_query_len = 1
        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
        max_decode_seq_len = max(self.curr_seq_lens, default=0)
        num_decode_tokens = self.num_decode_tokens
        query_start_loc = list(accumulate(query_lens, initial=0))
        seq_start_loc = list(accumulate(seq_lens, initial=0))

        if use_captured_graph:
            num_decode_tokens = batch_size - self.num_prefill_tokens
        assert max_query_len > 0, ("query_lens: {}".format(query_lens))

        assert device is not None
        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                               device, self.runner.pin_memory)
        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                           self.runner.pin_memory)
        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
                                                  device,
                                                  self.runner.pin_memory)
        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                                device, self.runner.pin_memory)

        placeholder_index_maps = {
            modality: placeholder_map.index_map()
            for modality, placeholder_map in
            self.multimodal_placeholder_maps.items()
        }

        # Placeholders
        slot_mapping_tensor = torch.empty(0)
        block_tables = torch.empty(0)

        return PlaceholderAttentionMetadata(
            num_prefills=self.num_prefills,
            slot_mapping=slot_mapping_tensor,
            multi_modal_placeholder_index_maps=placeholder_index_maps,
            enable_kv_scales_calculation=True,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_query_len=max_query_len,
            max_decode_query_len=max_decode_query_len,
            max_prefill_seq_len=max_prefill_seq_len,
            max_decode_seq_len=max_decode_seq_len,
            query_start_loc=query_start_loc_tensor,
            seq_start_loc=seq_start_loc_tensor,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=use_captured_graph,
        )
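# How the builder is used (a sketch inferred from this file, not a verbatim
# quote of the caller): prepare() resets the per-batch lists, build() walks
# input_builder.inter_data_list and calls _add_seq_group() for each sequence
# group, then uploads the host-side lists with async_tensor_h2d() and returns
# a PlaceholderAttentionMetadata whose slot_mapping and block_tables are empty
# placeholder tensors, since attention-free models never read them.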
class PlaceholderAttentionImpl(AttentionImpl):

    def __init__(self, *args, **kwargs) -> None:
        return

    def forward(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError