port fp8 mixtral (#460)

2024-05-21 11:46:35 -07:00
parent 19d2135cb8
commit 0fafc5606b
6 changed files with 633 additions and 118 deletions
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -106,6 +106,7 @@ def get_available_gpu_memory(gpu_id, distributed=True):
            "which may cause useless memory allocation for torch CUDA context.",
        )

+    torch.cuda.empty_cache()
    free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

    if distributed: