diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml index 6f6f9af0b..866d50be9 100644 --- a/docker/k8s-sglang-service.yaml +++ b/docker/k8s-sglang-service.yaml @@ -1,3 +1,16 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: llama-31-8b-sglang +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 30Gi + storageClassName: default # change this to your preferred storage class + volumeMode: Filesystem +--- apiVersion: node.k8s.io/v1 kind: RuntimeClass metadata: @@ -27,23 +40,36 @@ spec: containers: - name: meta-llama-31-8b-instruct-sglang image: docker.io/lmsysorg/sglang:latest - imagePullPolicy: Always # IfNotPresent or Never + imagePullPolicy: Always # IfNotPresent or Never ports: - containerPort: 30000 command: ["python3", "-m", "sglang.launch_server"] - args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] + args: + [ + "--model-path", + "meta-llama/Llama-3.1-8B-Instruct", + "--host", + "0.0.0.0", + "--port", + "30000", + ] env: - name: HF_TOKEN value: resources: limits: nvidia.com/gpu: 1 + cpu: 8 + memory: 40Gi + requests: + cpu: 2 + memory: 16Gi + nvidia.com/gpu: 1 volumeMounts: - name: shm mountPath: /dev/shm - name: hf-cache mountPath: /root/.cache/huggingface - readOnly: true - name: localtime mountPath: /etc/localtime readOnly: true @@ -51,17 +77,27 @@ spec: httpGet: path: /health port: 30000 - initialDelaySeconds: 30 - periodSeconds: 10 + initialDelaySeconds: 120 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health_generate + port: 30000 + initialDelaySeconds: 120 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 volumes: - name: shm emptyDir: medium: Memory sizeLimit: 10Gi - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + persistentVolumeClaim: + claimName: llama-31-8b-sglang - name: localtime hostPath: path: /etc/localtime @@ 
-76,6 +112,6 @@ spec: app: meta-llama-31-8b-instruct-sglang ports: - protocol: TCP - port: 30000 # port on host - targetPort: 30000 # port in container - type: LoadBalancer + port: 80 # port exposed by the Service + targetPort: 30000 # port in container + type: LoadBalancer # change to ClusterIP if needed