diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml index 820746c80..6f6f9af0b 100644 --- a/docker/k8s-sglang-service.yaml +++ b/docker/k8s-sglang-service.yaml @@ -39,6 +39,8 @@ spec: limits: nvidia.com/gpu: 1 volumeMounts: + - name: shm + mountPath: /dev/shm - name: hf-cache mountPath: /root/.cache/huggingface readOnly: true @@ -52,6 +54,10 @@ spec: initialDelaySeconds: 30 periodSeconds: 10 volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: 10Gi - name: hf-cache hostPath: path: /root/.cache/huggingface diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md index c2e81eafe..2027c082f 100644 --- a/docs/backend/server_arguments.md +++ b/docs/backend/server_arguments.md @@ -21,6 +21,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct ``` - See [hyperparameter tuning](hyperparameter_tuning.md) on tuning hyperparameters for better performance. +- For Docker and Kubernetes runs, you need to set up shared memory, which is used for inter-process communication. Set `--shm-size` for Docker, and increase the `/dev/shm` size in Kubernetes manifests. - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size. ```bash