ROCm: bump 6.3.0 (#3259)
This commit is contained in:
6
.github/workflows/release-docker-amd.yml
vendored
6
.github/workflows/release-docker-amd.yml
vendored
@@ -14,7 +14,7 @@ jobs:
|
|||||||
environment: 'prod'
|
environment: 'prod'
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
rocm_version: ['6.2.0']
|
rocm_version: ['6.3.0']
|
||||||
build_type: ['all', 'srt']
|
build_type: ['all', 'srt']
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
@@ -41,8 +41,8 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
||||||
|
|
||||||
if [ "${{ matrix.rocm_version }}" = "6.2.0" ]; then
|
if [ "${{ matrix.rocm_version }}" = "6.3.0" ]; then
|
||||||
rocm_tag="rocm620"
|
rocm_tag="rocm630"
|
||||||
else
|
else
|
||||||
echo "Unsupported ROCm version"
|
echo "Unsupported ROCm version"
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
# Usage (to build SGLang ROCm docker image):
|
# Usage (to build SGLang ROCm docker image):
|
||||||
# docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm620 -f Dockerfile.rocm .
|
# docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm630 -f Dockerfile.rocm .
|
||||||
|
|
||||||
# default base image
|
# default base image
|
||||||
ARG BASE_IMAGE="rocmshared/vllm-rocm:20250114-tuned-elementwise-layernorm"
|
ARG BASE_IMAGE="rocm/vllm-dev:20250114"
|
||||||
|
|
||||||
FROM $BASE_IMAGE AS base
|
FROM $BASE_IMAGE AS base
|
||||||
USER root
|
USER root
|
||||||
|
|||||||
@@ -11,9 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
|
|||||||
# Nvidia
|
# Nvidia
|
||||||
docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
|
docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
|
||||||
# AMD
|
# AMD
|
||||||
docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm620 /bin/bash
|
docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm630 /bin/bash
|
||||||
# AMD just the last 2 GPUs
|
# AMD just the last 2 GPUs
|
||||||
docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm620 /bin/bash
|
docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm630 /bin/bash
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 2: Configure the runner by `config.sh`
|
### Step 2: Configure the runner by `config.sh`
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ docker run --gpus all \
|
|||||||
Note: On AMD ROCm systems with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images; example usage is shown below:
|
Note: On AMD ROCm systems with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images; example usage is shown below:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm620 -f Dockerfile.rocm .
|
docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm630 -f Dockerfile.rocm .
|
||||||
|
|
||||||
alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \
|
alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \
|
||||||
--shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
|
--shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
|
||||||
@@ -63,11 +63,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d
|
|||||||
drun -p 30000:30000 \
|
drun -p 30000:30000 \
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
--env "HF_TOKEN=<secret>" \
|
--env "HF_TOKEN=<secret>" \
|
||||||
v0.4.2.post1-rocm620 \
|
v0.4.2.post1-rocm630 \
|
||||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
||||||
|
|
||||||
# Until the flashinfer backend is available, --attention-backend triton and --sampling-backend pytorch are set by default
|
# Until the flashinfer backend is available, --attention-backend triton and --sampling-backend pytorch are set by default
|
||||||
drun v0.4.2.post1-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
|
drun v0.4.2.post1-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
|
||||||
```
|
```
|
||||||
|
|
||||||
## Method 4: Using docker compose
|
## Method 4: Using docker compose
|
||||||
|
|||||||
@@ -19,31 +19,29 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
|
|||||||
runtime_common = [
|
runtime_common = [
|
||||||
"aiohttp", "decord", "fastapi",
|
"aiohttp", "decord", "fastapi",
|
||||||
"hf_transfer", "huggingface_hub", "interegular", "modelscope",
|
"hf_transfer", "huggingface_hub", "interegular", "modelscope",
|
||||||
"orjson", "outlines>=0.0.44,<0.1.0",
|
"orjson", "packaging", "pillow", "prometheus-client>=0.20.0",
|
||||||
"packaging", "pillow", "prometheus-client>=0.20.0",
|
"psutil", "pydantic", "python-multipart", "pyzmq>=25.1.2",
|
||||||
"psutil", "pydantic", "python-multipart",
|
"torchao>=0.7.0", "uvicorn", "uvloop", "xgrammar>=0.1.10"
|
||||||
"pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
|
|
||||||
"xgrammar>=0.1.10"
|
|
||||||
]
|
]
|
||||||
srt = [
|
srt = [
|
||||||
"sglang[runtime_common]", "cuda-python",
|
"sglang[runtime_common]", "cuda-python",
|
||||||
"sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
|
"sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
|
||||||
"flashinfer==0.1.6"
|
"flashinfer==0.1.6", "outlines>=0.0.44,<0.1.0"
|
||||||
]
|
]
|
||||||
|
|
||||||
# HIP (Heterogeneous-computing Interface for Portability) for AMD
|
# HIP (Heterogeneous-computing Interface for Portability) for AMD
|
||||||
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
|
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
|
||||||
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post2.dev1"]
|
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
|
||||||
# xpu is not enabled in public vllm and torch whl,
|
# xpu is not enabled in public vllm and torch whl,
|
||||||
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
|
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
|
||||||
srt_xpu = ["sglang[runtime_common]"]
|
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
|
||||||
# For Intel Gaudi (device: hpu), follow the installation guide
|
# For Intel Gaudi (device: hpu), follow the installation guide
|
||||||
#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
|
#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
|
||||||
srt_hpu = ["sglang[runtime_common]"]
|
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
|
||||||
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
||||||
# To install vllm for CPU, please follow the instruction here:
|
# To install vllm for CPU, please follow the instruction here:
|
||||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
||||||
srt_cpu = ["sglang[runtime_common]", "torch"]
|
srt_cpu = ["sglang[runtime_common]", "torch", "outlines>=0.0.44,<0.1.0"]
|
||||||
|
|
||||||
openai = ["openai>=1.0", "tiktoken"]
|
openai = ["openai>=1.0", "tiktoken"]
|
||||||
anthropic = ["anthropic>=0.20.0"]
|
anthropic = ["anthropic>=0.20.0"]
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ from typing import Dict, List, Optional, Tuple, Union
|
|||||||
import interegular
|
import interegular
|
||||||
import torch
|
import torch
|
||||||
from outlines.fsm.guide import RegexGuide
|
from outlines.fsm.guide import RegexGuide
|
||||||
from outlines.fsm.json_schema import build_regex_from_schema
|
|
||||||
from outlines.models.transformers import TransformerTokenizer
|
from outlines.models.transformers import TransformerTokenizer
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
@@ -29,6 +28,15 @@ from sglang.srt.constrained.base_grammar_backend import (
|
|||||||
BaseGrammarObject,
|
BaseGrammarObject,
|
||||||
)
|
)
|
||||||
from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
|
from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
|
||||||
|
from sglang.srt.utils import is_hip
|
||||||
|
|
||||||
|
is_hip_ = is_hip()
|
||||||
|
|
||||||
|
if is_hip_:
|
||||||
|
from outlines_core.fsm.json_schema import build_regex_from_schema
|
||||||
|
else:
|
||||||
|
from outlines.fsm.json_schema import build_regex_from_schema
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ class CustomOp(nn.Module):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def forward_hip(self, *args, **kwargs):
|
def forward_hip(self, *args, **kwargs):
|
||||||
raise NotImplementedError
|
return self.forward_native(*args, **kwargs)
|
||||||
|
|
||||||
def forward_xpu(self, *args, **kwargs):
|
def forward_xpu(self, *args, **kwargs):
|
||||||
return self.forward_native(*args, **kwargs)
|
return self.forward_native(*args, **kwargs)
|
||||||
|
|||||||
Reference in New Issue
Block a user