From 312e8492556dd092368452f349ed45af3e3a68b6 Mon Sep 17 00:00:00 2001 From: Lucien Date: Wed, 14 Aug 2024 06:07:57 +0800 Subject: [PATCH] Example file for docker compose and k8s (#1006) --- README.md | 10 ++++- docker/compose.yaml | 31 ++++++++++++++ docker/k8s-sglang-service.yaml | 76 ++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 docker/compose.yaml create mode 100644 docker/k8s-sglang-service.yaml diff --git a/README.md b/README.md index 59f72bf12..117c329bb 100644 --- a/README.md +++ b/README.md @@ -76,9 +76,17 @@ docker run --gpus all \ --env "HF_TOKEN=" \ --ipc=host \ lmsysorg/sglang:latest \ - python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000 + python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 ``` +### Method 4: Using docker compose + +> This method is recommended if you plan to serve it as a service. +> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). + +1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine. +2. Execute the command `docker compose up -d` in your terminal. + ### Common Notes - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. 
diff --git a/docker/compose.yaml b/docker/compose.yaml new file mode 100644 index 000000000..f2da3a416 --- /dev/null +++ b/docker/compose.yaml @@ -0,0 +1,31 @@ +services: + sglang: + image: lmsysorg/sglang:latest + container_name: sglang + volumes: + - ${HOME}/.cache/huggingface:/root/.cache/huggingface + restart: always + network_mode: host + # Or you can only publish port 30000 + # ports: + # - 30000:30000 + environment: + HF_TOKEN: + entrypoint: python3 -m sglang.launch_server + command: + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct + --host 0.0.0.0 + --port 30000 + ulimits: + memlock: -1 + stack: 67108864 + ipc: host + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml new file mode 100644 index 000000000..c217f356a --- /dev/null +++ b/docker/k8s-sglang-service.yaml @@ -0,0 +1,76 @@ +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: meta-llama-31-8b-instruct-sglang +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: meta-llama-31-8b-instruct-sglang + template: + metadata: + labels: + app: meta-llama-31-8b-instruct-sglang + model: meta-llama-31-8b-instruct + engine: sglang + spec: + hostIPC: true + restartPolicy: Always + runtimeClassName: nvidia + containers: + - name: meta-llama-31-8b-instruct-sglang + image: docker.io/lmsysorg/sglang:latest + imagePullPolicy: Always # IfNotPresent or Never + ports: + - containerPort: 30000 + command: ["python3", "-m", "sglang.launch_server"] + args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] + env: + - name: HF_TOKEN + value: + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: hf-cache + 
mountPath: /root/.cache/huggingface + readOnly: false + - name: localtime + mountPath: /etc/localtime + readOnly: true + livenessProbe: + httpGet: + path: /health + port: 30000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumes: + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + type: File +--- +apiVersion: v1 +kind: Service +metadata: + name: meta-llama-31-8b-instruct-sglang +spec: + selector: + app: meta-llama-31-8b-instruct-sglang + ports: + - protocol: TCP + port: 30000 # port on host + targetPort: 30000 # port in container + type: LoadBalancer