Add PVC and update resource limits in k8s config (#8489)

2025-07-29 14:15:31 +08:00
parent fb16fbaf52
commit 2e1d2d7e66
1 changed file with 47 additions and 11 deletions
--- a/docker/k8s-sglang-service.yaml
+++ b/docker/k8s-sglang-service.yaml
@@ -1,3 +1,16 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-31-8b-sglang
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 30Gi
+  storageClassName: default # change this to your preferred storage class
+  volumeMode: Filesystem
+---
 apiVersion: node.k8s.io/v1
 kind: RuntimeClass
 metadata:
@@ -27,23 +40,36 @@ spec:
       containers:
         - name: meta-llama-31-8b-instruct-sglang
           image: docker.io/lmsysorg/sglang:latest
-          imagePullPolicy: Always  # IfNotPresent or Never
+          imagePullPolicy: Always # IfNotPresent or Never
           ports:
             - containerPort: 30000
           command: ["python3", "-m", "sglang.launch_server"]
-          args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
+          args:
+            [
+              "--model-path",
+              "meta-llama/Llama-3.1-8B-Instruct",
+              "--host",
+              "0.0.0.0",
+              "--port",
+              "30000",
+            ]
           env:
             - name: HF_TOKEN
               value: <secret>
           resources:
             limits:
               nvidia.com/gpu: 1
+              cpu: 8
+              memory: 40Gi
+            requests:
+              cpu: 2
+              memory: 16Gi
+              nvidia.com/gpu: 1
           volumeMounts:
             - name: shm
               mountPath: /dev/shm
             - name: hf-cache
               mountPath: /root/.cache/huggingface
-              readOnly: true
             - name: localtime
               mountPath: /etc/localtime
               readOnly: true
@@ -51,17 +77,27 @@ spec:
             httpGet:
               path: /health
               port: 30000
-            initialDelaySeconds: 30
+            initialDelaySeconds: 120
-            periodSeconds: 10
+            periodSeconds: 15
+            timeoutSeconds: 10
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /health_generate
+              port: 30000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            timeoutSeconds: 10
+            failureThreshold: 3
+            successThreshold: 1
       volumes:
         - name: shm
           emptyDir:
             medium: Memory
             sizeLimit: 10Gi
         - name: hf-cache
-          hostPath:
+          persistentVolumeClaim:
-            path: /root/.cache/huggingface
+            claimName: llama-31-8b-sglang
-            type: Directory
         - name: localtime
           hostPath:
             path: /etc/localtime
@@ -76,6 +112,6 @@ spec:
     app: meta-llama-31-8b-instruct-sglang
   ports:
     - protocol: TCP
-      port: 30000  # port on host
+      port: 80 # port on host
-      targetPort: 30000  # port in container
+      targetPort: 30000 # port in container
-  type: LoadBalancer
+  type: LoadBalancer # change to ClusterIP if needed