Example file for docker compose and k8s (#1006)

2024-08-14 06:07:57 +08:00
parent 95f5fbf1a7
commit 312e849255
3 changed files with 116 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -76,9 +76,17 @@ docker run --gpus all \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 ### Method 4: Using docker compose
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
 1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine.
 2. Execute the command `docker compose up -d` in your terminal.
 ### Common Notes
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
--- a/docker/compose.yaml
+++ b/docker/compose.yaml
@@ -0,0 +1,31 @@
 # Docker Compose definition that runs the SGLang server as a long-lived service.
 services:
  sglang:
    container_name: sglang
    image: lmsysorg/sglang:latest
    restart: always
    # Start the server module directly; `command` supplies its arguments.
    entrypoint: python3 -m sglang.launch_server
    # Folded scalar: the lines below are joined with single spaces.
    command: >-
      --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 30000
    environment:
      HF_TOKEN: <secret>
    volumes:
      # Share the host user's Hugging Face cache with the container.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
    # Host networking: the server's port 30000 is reachable on the host directly.
    network_mode: host
    # Alternative: remove network_mode above and publish only port 30000.
    # ports:
    #   - "30000:30000"
    ipc: host
    ulimits:
      memlock: -1
      stack: 67108864
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -f http://localhost:30000/health || exit 1'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve the NVIDIA GPU with device id 0 for this service.
            - driver: nvidia
              device_ids:
                - '0'
              capabilities:
                - gpu
--- a/docker/k8s-sglang-service.yaml
+++ b/docker/k8s-sglang-service.yaml
@@ -0,0 +1,76 @@
 # RuntimeClass named `nvidia`; the Deployment in this file selects it through
 # `runtimeClassName: nvidia`.
 apiVersion: node.k8s.io/v1
 kind: RuntimeClass
 metadata:
  name: nvidia
 # Container-runtime handler this class maps to.
 # NOTE(review): presumably the NVIDIA runtime must already be configured under
 # this handler name on the nodes - confirm against the cluster setup.
 handler: nvidia
 ---
 # Deployment that runs a single SGLang server pod serving
 # meta-llama/Meta-Llama-3.1-8B-Instruct on container port 30000.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: meta-llama-31-8b-instruct-sglang
 spec:
  replicas: 1
  # Recreate terminates the old pod before the new one is created
  # (presumably so a rollout never needs two GPUs at once - confirm).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: meta-llama-31-8b-instruct-sglang
  template:
    metadata:
      labels:
        app: meta-llama-31-8b-instruct-sglang
        model: meta-llama-31-8b-instruct
        engine: sglang
    spec:
      hostIPC: true
      restartPolicy: Always
      # Must match the RuntimeClass defined in this file.
      runtimeClassName: nvidia
      containers:
        - name: meta-llama-31-8b-instruct-sglang
          image: docker.io/lmsysorg/sglang:latest
          # `latest` is re-pulled on every pod start; use IfNotPresent or Never
          # to rely on an image already present on the node.
          imagePullPolicy: Always
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "meta-llama/Meta-Llama-3.1-8B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
          env:
            # Placeholder: supply a real Hugging Face token here, preferably
            # through `valueFrom.secretKeyRef` instead of committing it.
            - name: HF_TOKEN
              value: <secret>
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            # Mounted read-write so the server can download the model into the
            # cache on first start (same behaviour as the compose.yaml mount).
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          # Model download and loading can take far longer than the liveness
          # window below. Until /health answers once, only this probe runs,
          # giving the server up to 60 * 10s = 10 minutes to come up before
          # the container is restarted.
          startupProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 10
            failureThreshold: 60
          # After startup has succeeded, restart the container when /health
          # stops answering.
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            initialDelaySeconds: 30
            periodSeconds: 10
      volumes:
        # Node-local Hugging Face cache; created on the node if it is missing
        # so the pod is not blocked on a fresh machine.
        - name: hf-cache
          hostPath:
            path: /root/.cache/huggingface
            type: DirectoryOrCreate
        # Reuse the node's timezone definition inside the container.
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
 ---
 # Service that exposes the SGLang pods through a load balancer.
 apiVersion: v1
 kind: Service
 metadata:
  name: meta-llama-31-8b-instruct-sglang
 spec:
  type: LoadBalancer
  # Targets pods carrying the Deployment's `app` label.
  selector:
    app: meta-llama-31-8b-instruct-sglang
  ports:
    - protocol: TCP
      port: 30000  # port exposed by the Service
      targetPort: 30000  # container port on the selected pods