[Fix] Adjust default chunked prefill size and cuda graph max bs according to GPU memory capacity (#2044)

This commit is contained in:
Lianmin Zheng
2024-11-15 06:21:57 -08:00
committed by GitHub
parent c29b98e043
commit b01df48cf2
2 changed files with 50 additions and 3 deletions

View File

@@ -27,6 +27,7 @@ import resource
import shutil
import signal
import socket
import subprocess
import tempfile
import time
import warnings
@@ -791,3 +792,35 @@ def add_prometheus_middleware(app):
# Workaround for 307 Redirect for /metrics
metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
app.routes.append(metrics_route)
def get_gpu_memory_capacity():
    """Return the smallest per-GPU total memory (in MiB) across visible GPUs.

    Queries ``nvidia-smi`` for every device's total memory and returns the
    minimum, so anything sized from this value fits on every GPU.

    Raises:
        RuntimeError: if ``nvidia-smi`` is missing or exits with an error.
        ValueError: if the output contains no parsable memory values.
    """
    try:
        # One plain number per line, one line per GPU (csv, no header/units).
        proc = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        raise RuntimeError(
            "nvidia-smi not found. Ensure NVIDIA drivers are installed and accessible."
        )

    if proc.returncode != 0:
        raise RuntimeError(f"nvidia-smi error: {proc.stderr.strip()}")

    # Keep only lines that look like a bare (possibly fractional) number.
    capacities = []
    for line in proc.stdout.strip().split("\n"):
        if re.match(r"^\d+(\.\d+)?$", line.strip()):
            capacities.append(float(line))

    if not capacities:
        raise ValueError("No GPU memory values found.")

    # The most constrained GPU bounds what can be allocated everywhere.
    return min(capacities)