Add PVC and update resource limits in k8s config (#8489)
This commit is contained in:
@@ -1,3 +1,16 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: llama-31-8b-sglang
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteMany
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 30Gi
|
||||||
|
storageClassName: default # change this to your preferred storage class; it must support the ReadWriteMany access mode requested above
|
||||||
|
volumeMode: Filesystem
|
||||||
|
---
|
||||||
apiVersion: node.k8s.io/v1
|
apiVersion: node.k8s.io/v1
|
||||||
kind: RuntimeClass
|
kind: RuntimeClass
|
||||||
metadata:
|
metadata:
|
||||||
@@ -27,23 +40,36 @@ spec:
|
|||||||
containers:
|
containers:
|
||||||
- name: meta-llama-31-8b-instruct-sglang
|
- name: meta-llama-31-8b-instruct-sglang
|
||||||
image: docker.io/lmsysorg/sglang:latest
|
image: docker.io/lmsysorg/sglang:latest
|
||||||
imagePullPolicy: Always # IfNotPresent or Never
|
imagePullPolicy: Always # IfNotPresent or Never
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 30000
|
- containerPort: 30000
|
||||||
command: ["python3", "-m", "sglang.launch_server"]
|
command: ["python3", "-m", "sglang.launch_server"]
|
||||||
args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
|
args:
|
||||||
|
[
|
||||||
|
"--model-path",
|
||||||
|
"meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"--host",
|
||||||
|
"0.0.0.0",
|
||||||
|
"--port",
|
||||||
|
"30000",
|
||||||
|
]
|
||||||
env:
|
env:
|
||||||
- name: HF_TOKEN
|
- name: HF_TOKEN
|
||||||
value: <secret>
|
value: <secret>
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
nvidia.com/gpu: 1
|
nvidia.com/gpu: 1
|
||||||
|
cpu: 8
|
||||||
|
memory: 40Gi
|
||||||
|
requests:
|
||||||
|
cpu: 2
|
||||||
|
memory: 16Gi
|
||||||
|
nvidia.com/gpu: 1
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: shm
|
- name: shm
|
||||||
mountPath: /dev/shm
|
mountPath: /dev/shm
|
||||||
- name: hf-cache
|
- name: hf-cache
|
||||||
mountPath: /root/.cache/huggingface
|
mountPath: /root/.cache/huggingface
|
||||||
readOnly: true
|
|
||||||
- name: localtime
|
- name: localtime
|
||||||
mountPath: /etc/localtime
|
mountPath: /etc/localtime
|
||||||
readOnly: true
|
readOnly: true
|
||||||
@@ -51,17 +77,27 @@ spec:
|
|||||||
httpGet:
|
httpGet:
|
||||||
path: /health
|
path: /health
|
||||||
port: 30000
|
port: 30000
|
||||||
initialDelaySeconds: 30
|
initialDelaySeconds: 120
|
||||||
periodSeconds: 10
|
periodSeconds: 15
|
||||||
|
timeoutSeconds: 10
|
||||||
|
failureThreshold: 3
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health_generate
|
||||||
|
port: 30000
|
||||||
|
initialDelaySeconds: 120
|
||||||
|
periodSeconds: 15
|
||||||
|
timeoutSeconds: 10
|
||||||
|
failureThreshold: 3
|
||||||
|
successThreshold: 1
|
||||||
volumes:
|
volumes:
|
||||||
- name: shm
|
- name: shm
|
||||||
emptyDir:
|
emptyDir:
|
||||||
medium: Memory
|
medium: Memory
|
||||||
sizeLimit: 10Gi
|
sizeLimit: 10Gi
|
||||||
- name: hf-cache
|
- name: hf-cache
|
||||||
hostPath:
|
persistentVolumeClaim:
|
||||||
path: /root/.cache/huggingface
|
claimName: llama-31-8b-sglang
|
||||||
type: Directory
|
|
||||||
- name: localtime
|
- name: localtime
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /etc/localtime
|
path: /etc/localtime
|
||||||
@@ -76,6 +112,6 @@ spec:
|
|||||||
app: meta-llama-31-8b-instruct-sglang
|
app: meta-llama-31-8b-instruct-sglang
|
||||||
ports:
|
ports:
|
||||||
- protocol: TCP
|
- protocol: TCP
|
||||||
port: 30000 # port on host
|
port: 80 # port exposed by the Service (load balancer)
|
||||||
targetPort: 30000 # port in container
|
targetPort: 30000 # port in container
|
||||||
type: LoadBalancer
|
type: LoadBalancer # change to ClusterIP if needed
|
||||||
|
|||||||
Reference in New Issue
Block a user