Add PVC and update resource limits in k8s config (#8489)
This commit is contained in:
@@ -1,3 +1,16 @@
|
||||
# Storage claim bound by name from the hf-cache volume further down
# (claimName: llama-31-8b-sglang), which is mounted at /root/.cache/huggingface.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-31-8b-sglang
spec:
  accessModes:
    # Read-write from multiple nodes; the chosen storage class must support it.
    - ReadWriteMany
  resources:
    requests:
      storage: 30Gi
  storageClassName: default  # change this to your preferred storage class
  volumeMode: Filesystem
|
||||
---
|
||||
apiVersion: node.k8s.io/v1
|
||||
kind: RuntimeClass
|
||||
metadata:
|
||||
@@ -27,23 +40,36 @@ spec:
|
||||
containers:
|
||||
- name: meta-llama-31-8b-instruct-sglang
|
||||
image: docker.io/lmsysorg/sglang:latest
|
||||
imagePullPolicy: Always # IfNotPresent or Never
|
||||
imagePullPolicy: Always # IfNotPresent or Never
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
command: ["python3", "-m", "sglang.launch_server"]
|
||||
args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
|
||||
args:
|
||||
[
|
||||
"--model-path",
|
||||
"meta-llama/Llama-3.1-8B-Instruct",
|
||||
"--host",
|
||||
"0.0.0.0",
|
||||
"--port",
|
||||
"30000",
|
||||
]
|
||||
env:
|
||||
- name: HF_TOKEN
|
||||
value: <secret>
|
||||
# Compute budget for the container: `limits` is the hard ceiling,
# `requests` is what the scheduler reserves on the node.
resources:
  limits:
    nvidia.com/gpu: 1
    cpu: 8
    memory: 40Gi
  requests:
    cpu: 2
    memory: 16Gi
    # For extended resources such as GPUs the request must equal the limit.
    nvidia.com/gpu: 1
|
||||
volumeMounts:
|
||||
- name: shm
|
||||
mountPath: /dev/shm
|
||||
- name: hf-cache
|
||||
mountPath: /root/.cache/huggingface
|
||||
readOnly: true
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
@@ -51,17 +77,27 @@ spec:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 30000
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 10
|
||||
failureThreshold: 3
|
||||
# Readiness gate: HTTP GET /health_generate on the serving port (30000).
readinessProbe:
  httpGet:
    path: /health_generate
    port: 30000
  # Wait 120s before the first check, then probe every 15s with a 10s timeout.
  initialDelaySeconds: 120
  periodSeconds: 15
  timeoutSeconds: 10
  # 3 consecutive failures mark the pod unready; 1 success marks it ready again.
  failureThreshold: 3
  successThreshold: 1
|
||||
volumes:
|
||||
- name: shm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 10Gi
|
||||
- name: hf-cache
|
||||
hostPath:
|
||||
path: /root/.cache/huggingface
|
||||
type: Directory
|
||||
persistentVolumeClaim:
|
||||
claimName: llama-31-8b-sglang
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
@@ -76,6 +112,6 @@ spec:
|
||||
app: meta-llama-31-8b-instruct-sglang
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000 # port on host
|
||||
targetPort: 30000 # port in container
|
||||
type: LoadBalancer
|
||||
port: 80 # port on host
|
||||
targetPort: 30000 # port in container
|
||||
type: LoadBalancer # change to ClusterIP if needed
|
||||
|
||||
Reference in New Issue
Block a user