diff --git a/.github/workflows/release-docker-amd.yml b/.github/workflows/release-docker-amd.yml
index 228eecdb9..ffe2843d5 100644
--- a/.github/workflows/release-docker-amd.yml
+++ b/.github/workflows/release-docker-amd.yml
@@ -14,7 +14,7 @@ jobs:
     environment: 'prod'
     strategy:
       matrix:
-        rocm_version: ['6.2.0']
+        rocm_version: ['6.3.0']
         build_type: ['all', 'srt']
     steps:
       - name: Checkout repository
@@ -41,8 +41,8 @@ jobs:
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)

-          if [ "${{ matrix.rocm_version }}" = "6.2.0" ]; then
-            rocm_tag="rocm620"
+          if [ "${{ matrix.rocm_version }}" = "6.3.0" ]; then
+            rocm_tag="rocm630"
           else
             echo "Unsupported ROCm version"
             exit 1
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index e1a242c87..caa4666c8 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,8 +1,8 @@
 # Usage (to build SGLang ROCm docker image):
-# docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm620 -f Dockerfile.rocm .
+# docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm630 -f Dockerfile.rocm .

 # default base image
-ARG BASE_IMAGE="rocmshared/vllm-rocm:20250114-tuned-elementwise-layernorm"
+ARG BASE_IMAGE="rocm/vllm-dev:20250114"

 FROM $BASE_IMAGE AS base
 USER root
diff --git a/docs/developer/setup_github_runner.md b/docs/developer/setup_github_runner.md
index 96c9cae01..cde8c0aa9 100644
--- a/docs/developer/setup_github_runner.md
+++ b/docs/developer/setup_github_runner.md
@@ -11,9 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
 # Nvidia
 docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
 # AMD
-docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm620 /bin/bash
+docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm630 /bin/bash
 # AMD just the last 2 GPUs
-docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm620 /bin/bash
+docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm630 /bin/bash
 ```

 ### Step 2: Configure the runner by `config.sh`
diff --git a/docs/start/install.md b/docs/start/install.md
index a5012d6fc..b9702f021 100644
--- a/docs/start/install.md
+++ b/docs/start/install.md
@@ -54,7 +54,7 @@ docker run --gpus all \
 Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below:

 ```bash
-docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm620 -f Dockerfile.rocm .
+docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm630 -f Dockerfile.rocm .

 alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \
     --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
@@ -63,11 +63,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d
 drun -p 30000:30000 \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     --env "HF_TOKEN=<secret>" \
-    v0.4.2.post1-rocm620 \
+    v0.4.2.post1-rocm630 \
     python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000

 # Till flashinfer backend available, --attention-backend triton --sampling-backend pytorch are set by default
-drun v0.4.2.post1-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
+drun v0.4.2.post1-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
 ```

 ## Method 4: Using docker compose
diff --git a/python/pyproject.toml b/python/pyproject.toml
index d3d8c3f2a..cf997fc96 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -19,31 +19,29 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
 runtime_common = [
     "aiohttp", "decord", "fastapi",
     "hf_transfer", "huggingface_hub", "interegular", "modelscope",
-    "orjson", "outlines>=0.0.44,<0.1.0",
-    "packaging", "pillow", "prometheus-client>=0.20.0",
-    "psutil", "pydantic", "python-multipart",
-    "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
-    "xgrammar>=0.1.10"
+    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart", "pyzmq>=25.1.2",
+    "torchao>=0.7.0", "uvicorn", "uvloop", "xgrammar>=0.1.10"
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python",
     "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
-    "flashinfer==0.1.6"
+    "flashinfer==0.1.6", "outlines>=0.0.44,<0.1.0"
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post2.dev1"]
+srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt_xpu = ["sglang[runtime_common]"]
+srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
 #For Intel Gaudi(device : hpu) follow the installation guide
 #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]"]
+srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "torch"]
+srt_cpu = ["sglang[runtime_common]", "torch", "outlines>=0.0.44,<0.1.0"]

 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
diff --git a/python/sglang/srt/constrained/outlines_backend.py b/python/sglang/srt/constrained/outlines_backend.py
index 4820d4739..91dbcba24 100644
--- a/python/sglang/srt/constrained/outlines_backend.py
+++ b/python/sglang/srt/constrained/outlines_backend.py
@@ -20,7 +20,6 @@ from typing import Dict, List, Optional, Tuple, Union
 import interegular
 import torch
 from outlines.fsm.guide import RegexGuide
-from outlines.fsm.json_schema import build_regex_from_schema
 from outlines.models.transformers import TransformerTokenizer
 from pydantic import BaseModel

@@ -29,6 +28,15 @@ from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarObject,
 )
 from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
+from sglang.srt.utils import is_hip
+
+is_hip_ = is_hip()
+
+if is_hip_:
+    from outlines_core.fsm.json_schema import build_regex_from_schema
+else:
+    from outlines.fsm.json_schema import build_regex_from_schema
+


 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/custom_op.py b/python/sglang/srt/custom_op.py
index a702e8f82..c35790691 100644
--- a/python/sglang/srt/custom_op.py
+++ b/python/sglang/srt/custom_op.py
@@ -20,7 +20,7 @@ class CustomOp(nn.Module):
         raise NotImplementedError

     def forward_hip(self, *args, **kwargs):
-        raise NotImplementedError
+        return self.forward_native(*args, **kwargs)

     def forward_xpu(self, *args, **kwargs):
         return self.forward_native(*args, **kwargs)
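For reviewers, a minimal sketch (not part of the diff) of the HIP-conditional import this change introduces in outlines_backend.py. It assumes sglang.srt.utils.is_hip() simply checks torch.version.hip, and that the outlines==0.1.11 pin for ROCm ships the JSON-schema helper in the companion outlines_core package:

```python
import json

import torch


def is_hip() -> bool:
    # torch.version.hip is a version string on ROCm builds of PyTorch,
    # and None on CUDA/CPU builds.
    return torch.version.hip is not None


# outlines>=0.1 (pinned as 0.1.11 in srt_hip) moved the FSM helpers into
# the separate outlines_core package; outlines<0.1 keeps them in-tree.
if is_hip():
    from outlines_core.fsm.json_schema import build_regex_from_schema
else:
    from outlines.fsm.json_schema import build_regex_from_schema

# Usage: compile a JSON schema (passed as a string) into the regex that
# constrains decoding.
schema = {"type": "object", "properties": {"name": {"type": "string"}}}
regex = build_regex_from_schema(json.dumps(schema))
```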
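The custom_op.py hunk makes the HIP path usable by default: a CustomOp without a dedicated HIP kernel now falls back to its pure-PyTorch forward_native, matching what the XPU path already does. A hedged sketch of that dispatch shape (illustrative only, not sglang's exact mechanism):

```python
import torch
from torch import nn


class CustomOp(nn.Module):
    """A fused-op wrapper that picks a per-platform implementation."""

    def forward(self, *args, **kwargs):
        # Route to the platform-specific implementation at call time.
        if torch.version.hip is not None:
            return self.forward_hip(*args, **kwargs)
        if torch.cuda.is_available():
            return self.forward_cuda(*args, **kwargs)
        return self.forward_native(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        # Portable PyTorch reference; subclasses must implement it.
        raise NotImplementedError

    def forward_cuda(self, *args, **kwargs):
        raise NotImplementedError

    def forward_hip(self, *args, **kwargs):
        # The change above: instead of raising, ROCm reuses the native
        # path, so ops without a HIP kernel still work out of the box.
        return self.forward_native(*args, **kwargs)
```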