Improve error handling (#433)
@@ -4,9 +4,7 @@ import base64
import os
import random
import socket
import sys
import time
import traceback
from importlib.metadata import PackageNotFoundError, version
from io import BytesIO
from typing import List, Optional
@@ -20,6 +18,8 @@ from packaging import version as pkg_version
from pydantic import BaseModel
from starlette.middleware.base import BaseHTTPMiddleware

from sglang.utils import get_exception_traceback

show_time_cost = False
time_infos = {}
@@ -90,6 +90,32 @@ def calculate_time(show=False, min_cost_ms=0.0):
    return wrapper


def get_available_gpu_memory(gpu_id, distributed=True):
    """
    Get the available memory of the cuda:gpu_id device in GiB.
    When distributed is True, the available memory is the minimum available memory of all GPUs.
    """
    num_gpus = torch.cuda.device_count()
    assert gpu_id < num_gpus

    if torch.cuda.current_device() != gpu_id:
        print(
            f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
            "which may cause useless memory allocation for torch CUDA context.",
        )

    # mem_get_info returns (free, total) in bytes for the given device.
    free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

    if distributed:
        tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
            torch.device("cuda", gpu_id)
        )
        # Take the minimum free memory across all ranks so every worker sees the same budget.
        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
        free_gpu_memory = tensor.item()

    # Convert bytes to GiB.
    return free_gpu_memory / (1 << 30)
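# Example usage (illustrative only; the caller and the headroom value are hypothetical):
# reserve most of the reported free memory while leaving room for activations.
#     free_gib = get_available_gpu_memory(gpu_id=0, distributed=False)
#     cache_gib = max(free_gib - 4.0, 0.0)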


def set_random_seed(seed: int) -> None:
    random.seed(seed)
@@ -158,12 +184,6 @@ def allocate_init_ports(
    return port, additional_ports


def get_exception_traceback():
    etype, value, tb = sys.exc_info()
    err_str = "".join(traceback.format_exception(etype, value, tb))
    return err_str
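# Illustrative call site (handle_request is a hypothetical handler): log the formatted
# traceback of the exception currently being handled instead of letting it propagate.
#     try:
#         handle_request()
#     except Exception:
#         print(get_exception_traceback(), file=sys.stderr)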


def get_int_token_logit_bias(tokenizer, vocab_size):
    # Work around a bug when the model's vocab size is larger than tokenizer.vocab_size.
    vocab_size = tokenizer.vocab_size
@@ -314,4 +334,4 @@ IS_PYDANTIC_1 = int(pydantic.VERSION.split(".")[0]) == 1
def jsonify_pydantic_model(obj: BaseModel):
    if IS_PYDANTIC_1:
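        # Pydantic v1's .json() goes through json.dumps, which escapes non-ASCII
        # characters by default, so ensure_ascii=False keeps UTF-8 text readable.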
        return obj.json(ensure_ascii=False)
    return obj.model_dump_json()