From 312e8492556dd092368452f349ed45af3e3a68b6 Mon Sep 17 00:00:00 2001 From: Lucien Date: Wed, 14 Aug 2024 06:07:57 +0800 Subject: [PATCH] Example file for docker compose and k8s (#1006) --- README.md | 10 ++++- docker/compose.yaml | 31 ++++++++++++++ docker/k8s-sglang-service.yaml | 76 ++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 docker/compose.yaml create mode 100644 docker/k8s-sglang-service.yaml diff --git a/README.md b/README.md index 59f72bf12..117c329bb 100644 --- a/README.md +++ b/README.md @@ -76,9 +76,17 @@ docker run --gpus all \ --env "HF_TOKEN=" \ --ipc=host \ lmsysorg/sglang:latest \ - python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000 + python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 ``` +### Method 4: Using docker compose + +> This method is recommended if you plan to serve it as a service. +> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). + +1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine. +2. Execute the command `docker compose up -d` in your terminal. + ### Common Notes - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. 
diff --git a/docker/compose.yaml b/docker/compose.yaml new file mode 100644 index 000000000..f2da3a416 --- /dev/null +++ b/docker/compose.yaml @@ -0,0 +1,31 @@ +services: + sglang: + image: lmsysorg/sglang:latest + container_name: sglang + volumes: + - ${HOME}/.cache/huggingface:/root/.cache/huggingface + restart: always + network_mode: host + # Or you can only publish port 30000 + # ports: + # - 30000:30000 + environment: + HF_TOKEN: + entrypoint: python3 -m sglang.launch_server + command: + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct + --host 0.0.0.0 + --port 30000 + ulimits: + memlock: -1 + stack: 67108864 + ipc: host + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml new file mode 100644 index 000000000..c217f356a --- /dev/null +++ b/docker/k8s-sglang-service.yaml @@ -0,0 +1,76 @@ +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: meta-llama-31-8b-instruct-sglang +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: meta-llama-31-8b-instruct-sglang + template: + metadata: + labels: + app: meta-llama-31-8b-instruct-sglang + model: meta-llama-31-8b-instruct + engine: sglang + spec: + hostIPC: true + restartPolicy: Always + runtimeClassName: nvidia + containers: + - name: meta-llama-31-8b-instruct-sglang + image: docker.io/lmsysorg/sglang:latest + imagePullPolicy: Always # IfNotPresent or Never + ports: + - containerPort: 30000 + command: ["python3", "-m", "sglang.launch_server"] + args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] + env: + - name: HF_TOKEN + value: + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: hf-cache + 
mountPath: /root/.cache/huggingface + readOnly: false + - name: localtime + mountPath: /etc/localtime + readOnly: true + livenessProbe: + httpGet: + path: /health + port: 30000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumes: + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + type: File +--- +apiVersion: v1 +kind: Service +metadata: + name: meta-llama-31-8b-instruct-sglang +spec: + selector: + app: meta-llama-31-8b-instruct-sglang + ports: + - protocol: TCP + port: 30000 # port on host + targetPort: 30000 # port in container + type: LoadBalancer