Example file for docker compose and k8s (#1006)
This commit is contained in:
10
README.md
10
README.md
@@ -76,9 +76,17 @@ docker run --gpus all \
|
|||||||
--env "HF_TOKEN=<secret>" \
|
--env "HF_TOKEN=<secret>" \
|
||||||
--ipc=host \
|
--ipc=host \
|
||||||
lmsysorg/sglang:latest \
|
lmsysorg/sglang:latest \
|
||||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
|
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Method 4: Using docker compose
|
||||||
|
|
||||||
|
> This method is recommended if you plan to serve it as a service.
|
||||||
|
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
|
||||||
|
|
||||||
|
1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine.
|
||||||
|
2. Execute the command `docker compose up -d` in your terminal.
|
||||||
|
|
||||||
### Common Notes
|
### Common Notes
|
||||||
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
||||||
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
||||||
|
|||||||
31
docker/compose.yaml
Normal file
31
docker/compose.yaml
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
services:
|
||||||
|
sglang:
|
||||||
|
image: lmsysorg/sglang:latest
|
||||||
|
container_name: sglang
|
||||||
|
volumes:
|
||||||
|
- ${HOME}/.cache/huggingface:/root/.cache/huggingface
|
||||||
|
restart: always
|
||||||
|
network_mode: host
|
||||||
|
# Alternatively, instead of host networking, publish only port 30000
|
||||||
|
# ports:
|
||||||
|
# - 30000:30000
|
||||||
|
environment:
|
||||||
|
HF_TOKEN: <secret>
|
||||||
|
entrypoint: python3 -m sglang.launch_server
|
||||||
|
command:
|
||||||
|
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct
|
||||||
|
--host 0.0.0.0
|
||||||
|
--port 30000
|
||||||
|
ulimits:
|
||||||
|
memlock: -1
|
||||||
|
stack: 67108864
|
||||||
|
ipc: host
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
device_ids: ['0']
|
||||||
|
capabilities: [gpu]
|
||||||
76
docker/k8s-sglang-service.yaml
Normal file
76
docker/k8s-sglang-service.yaml
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
apiVersion: node.k8s.io/v1
|
||||||
|
kind: RuntimeClass
|
||||||
|
metadata:
|
||||||
|
name: nvidia
|
||||||
|
handler: nvidia
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: meta-llama-31-8b-instruct-sglang
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: meta-llama-31-8b-instruct-sglang
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: meta-llama-31-8b-instruct-sglang
|
||||||
|
model: meta-llama-31-8b-instruct
|
||||||
|
engine: sglang
|
||||||
|
spec:
|
||||||
|
hostIPC: true
|
||||||
|
restartPolicy: Always
|
||||||
|
runtimeClassName: nvidia
|
||||||
|
containers:
|
||||||
|
- name: meta-llama-31-8b-instruct-sglang
|
||||||
|
image: docker.io/lmsysorg/sglang:latest
|
||||||
|
imagePullPolicy: Always # IfNotPresent or Never
|
||||||
|
ports:
|
||||||
|
- containerPort: 30000
|
||||||
|
command: ["python3", "-m", "sglang.launch_server"]
|
||||||
|
args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
|
||||||
|
env:
|
||||||
|
- name: HF_TOKEN
|
||||||
|
value: <secret>
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 1
|
||||||
|
volumeMounts:
|
||||||
|
- name: hf-cache
|
||||||
|
mountPath: /root/.cache/huggingface
|
||||||
|
readOnly: true
|
||||||
|
- name: localtime
|
||||||
|
mountPath: /etc/localtime
|
||||||
|
readOnly: true
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 30000
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 10
|
||||||
|
volumes:
|
||||||
|
- name: hf-cache
|
||||||
|
hostPath:
|
||||||
|
path: /root/.cache/huggingface
|
||||||
|
type: Directory
|
||||||
|
- name: localtime
|
||||||
|
hostPath:
|
||||||
|
path: /etc/localtime
|
||||||
|
type: File
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: meta-llama-31-8b-instruct-sglang
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app: meta-llama-31-8b-instruct-sglang
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 30000 # port exposed by the Service
|
||||||
|
targetPort: 30000 # port in container
|
||||||
|
type: LoadBalancer
|
||||||
Reference in New Issue
Block a user