[Fix] Adjust default chunked prefill size and cuda graph max bs according to GPU memory capacity (#2044)
This commit is contained in:
@@ -27,6 +27,7 @@ import resource
|
||||
import shutil
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
@@ -791,3 +792,35 @@ def add_prometheus_middleware(app):
|
||||
# Workaround for 307 Redirect for /metrics
|
||||
metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
|
||||
app.routes.append(metrics_route)
|
||||
|
||||
|
||||
def get_gpu_memory_capacity():
    """Return the smallest total memory (MiB) among all visible GPUs.

    Runs ``nvidia-smi --query-gpu=memory.total`` and parses one numeric
    value per GPU line, returning the minimum so callers can size
    defaults (e.g. chunked prefill size, cuda graph max batch size) for
    the least-capable device.

    Returns:
        float: the minimum GPU memory capacity reported, in MiB
            (nvidia-smi is invoked with ``nounits``).

    Raises:
        RuntimeError: if ``nvidia-smi`` is not installed/accessible, or
            exits with a non-zero status.
        ValueError: if no numeric memory values can be parsed from the
            nvidia-smi output.
    """
    # Only subprocess.run can raise FileNotFoundError here, so the
    # handler is scoped to just that call.
    try:
        proc = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        raise RuntimeError(
            "nvidia-smi not found. Ensure NVIDIA drivers are installed and accessible."
        )

    if proc.returncode != 0:
        raise RuntimeError(f"nvidia-smi error: {proc.stderr.strip()}")

    # Keep only lines that are plain (possibly fractional) numbers;
    # this drops any warnings or blank lines mixed into stdout.
    numeric_line = re.compile(r"^\d+(\.\d+)?$")
    capacities = [
        float(line)
        for line in proc.stdout.strip().split("\n")
        if numeric_line.match(line.strip())
    ]

    if not capacities:
        raise ValueError("No GPU memory values found.")

    return min(capacities)
|
||||
|
||||
Reference in New Issue
Block a user