Example file for docker compose and k8s (#1006)
This commit is contained in:
10
README.md
10
README.md
@@ -76,9 +76,17 @@ docker run --gpus all \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
--ipc=host \
|
||||
lmsysorg/sglang:latest \
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
||||
```
|
||||
|
||||
### Method 4: Using docker compose
|
||||
|
||||
> This method is recommended if you plan to serve it as a service.
|
||||
> If you deploy on Kubernetes, a better approach is to use [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
|
||||
|
||||
1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine.
|
||||
2. Execute the command `docker compose up -d` in your terminal.
|
||||
|
||||
### Common Notes
|
||||
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
||||
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
||||
|
||||
31
docker/compose.yaml
Normal file
31
docker/compose.yaml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Docker Compose definition that runs the SGLang server as a long-lived
# service listening on port 30000.
services:
  sglang:
    container_name: sglang
    image: lmsysorg/sglang:latest
    restart: always
    # Share the host's network stack and IPC namespace with the container.
    network_mode: host
    ipc: host
    # Or you can only publish port 30000
    # ports:
    #   - "30000:30000"
    volumes:
      # Expose the host user's Hugging Face cache inside the container.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
    environment:
      # Placeholder: substitute your Hugging Face token.
      HF_TOKEN: <secret>
    entrypoint: python3 -m sglang.launch_server
    # Folded scalar: the lines below are joined with single spaces into one
    # argument string appended to the entrypoint.
    command: >-
      --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 30000
    ulimits:
      memlock: -1
      stack: 67108864
    healthcheck:
      # Healthy when the server's /health endpoint answers successfully.
      test:
        - CMD-SHELL
        - 'curl -f http://localhost:30000/health || exit 1'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve the NVIDIA GPU with device ID 0 for this service.
            - driver: nvidia
              capabilities:
                - gpu
              device_ids:
                - '0'
|
||||
76
docker/k8s-sglang-service.yaml
Normal file
76
docker/k8s-sglang-service.yaml
Normal file
@@ -0,0 +1,76 @@
|
||||
# RuntimeClass named "nvidia" that maps to the "nvidia" runtime handler.
# Pods select it with `runtimeClassName: nvidia`.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
|
||||
---
|
||||
# Deployment running a single SGLang server replica that serves
# meta-llama/Meta-Llama-3.1-8B-Instruct on container port 30000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  replicas: 1
  strategy:
    # Recreate terminates the old pod before the new one is started.
    type: Recreate
  selector:
    matchLabels:
      app: meta-llama-31-8b-instruct-sglang
  template:
    metadata:
      labels:
        app: meta-llama-31-8b-instruct-sglang
        model: meta-llama-31-8b-instruct
        engine: sglang
    spec:
      hostIPC: true
      restartPolicy: Always
      runtimeClassName: nvidia
      containers:
        - name: meta-llama-31-8b-instruct-sglang
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always  # IfNotPresent or Never
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "meta-llama/Meta-Llama-3.1-8B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
          env:
            # Placeholder value. For real deployments prefer
            # `valueFrom.secretKeyRef` pointing at a Kubernetes Secret instead
            # of committing the token in this manifest.
            - name: HF_TOKEN
              value: <secret>
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            # Mounted read-write: a read-only cache cannot receive model files
            # that still have to be downloaded with HF_TOKEN.
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          # The startup probe holds back the liveness probe until /health
          # answers for the first time, allowing up to
          # failureThreshold * periodSeconds = 600s for model download and
          # load. Without it the liveness probe would restart the container
          # while the model is still loading.
          startupProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 10
            failureThreshold: 60
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            initialDelaySeconds: 30
            periodSeconds: 10
      volumes:
        # Hugging Face cache directory on the node; it must already exist
        # (hostPath type Directory).
        - name: hf-cache
          hostPath:
            path: /root/.cache/huggingface
            type: Directory
        # Node's localtime file, mounted read-only into the container.
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
|
||||
---
|
||||
# Service that routes TCP traffic on port 30000 to the pods labelled
# `app: meta-llama-31-8b-instruct-sglang`, exposed through a load balancer.
apiVersion: v1
kind: Service
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  type: LoadBalancer
  selector:
    app: meta-llama-31-8b-instruct-sglang
  ports:
    - protocol: TCP
      port: 30000  # port exposed by the Service
      targetPort: 30000  # port in container
|
||||
Reference in New Issue
Block a user