diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index cd13f6e9a..4e7ab9075 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -99,8 +99,6 @@ jobs:
     needs: [check-changes, sgl-kernel-build-wheels]
     if: needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - uses: actions/checkout@v4
@@ -233,8 +231,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -266,8 +262,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -299,8 +293,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 4-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -332,8 +324,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 8-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -365,8 +355,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -426,8 +414,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -479,8 +465,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -538,8 +522,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -570,8 +552,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -602,8 +582,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 4-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -631,8 +609,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 8-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v4
@@ -660,8 +636,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 4-b200-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
     steps:
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
index b41d0e3b7..436d62f27 100644
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -35,6 +35,7 @@ else:
     Image = Any
 
 
+# Parameters for a session
 @dataclass
 class SessionParams:
     id: Optional[str] = None
@@ -84,8 +85,6 @@ class GenerateReqInput:
     sampling_params: Optional[Union[List[Dict], Dict]] = None
     # The request id.
     rid: Optional[Union[List[str], str]] = None
-    # Extra key for classifying the request (e.g. cache_salt)
-    extra_key: Optional[Union[List[str], str]] = None
     # Whether to return logprobs.
     return_logprob: Optional[Union[List[bool], bool]] = None
     # If return logprobs, the start location in the prompt for returning logprobs.
@@ -134,18 +133,23 @@ class GenerateReqInput:
     # Conversation id used for tracking requests
     conversation_id: Optional[str] = None
 
-    # (Deprecated, please use custom_labels) Label for the request
-    label: Optional[str] = None
-
     # Priority for the request
     priority: Optional[int] = None
 
-    # Image gen grpc migration
-    return_bytes: bool = False
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
 
     # For custom metric labels
     custom_labels: Optional[Dict[str, str]] = None
 
+    # (Deprecated, please use custom_labels) Label for the request
+    label: Optional[str] = None
+
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
+
     def contains_mm_input(self) -> bool:
         return (
             has_valid_data(self.image_data)
@@ -544,8 +548,11 @@ class GenerateReqInput:
                 self.data_parallel_rank if self.data_parallel_rank is not None else None
             ),
             conversation_id=self.conversation_id,
-            label=self.label,
             priority=self.priority,
+            extra_key=self.extra_key,
+            no_logs=self.no_logs,
+            custom_labels=self.custom_labels,
+            label=self.label,
             return_bytes=self.return_bytes,
         )
 
@@ -602,21 +609,23 @@ class TokenizedGenerateReqInput:
     # For dp balance
     dp_balance_id: int = -1
 
-    # Label for the request
-    label: Optional[str] = None
-
     # Priority for the request
     priority: Optional[int] = None
 
     # Extra key for classifying the request (e.g. cache_salt)
     extra_key: Optional[str] = None
 
-    # Image gen grpc migration
-    return_bytes: bool = False
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
 
     # tracing context
     trace_context: Optional[Dict] = None
 
+    # (Deprecated, please use custom_labels) Label for the request
+    label: Optional[str] = None
+
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
+
 
 @dataclass
 class BatchTokenizedGenerateReqInput:
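For context on how the reorganized `GenerateReqInput` fields fit together, here is a minimal, hypothetical construction. The field names come from the dataclass above; the concrete values (the prompt, the tenant-style extra key, and the label dictionary) are illustrative only:

```python
from sglang.srt.managers.io_struct import GenerateReqInput

# Hypothetical request; the values below are made up for illustration.
req = GenerateReqInput(
    text="What is the capital of France?",
    sampling_params={"max_new_tokens": 32},
    extra_key="tenant-42",             # cache_salt-style key for classifying the request
    no_logs=True,                      # disallow logging for this request (e.g. due to ZDR)
    custom_labels={"team": "search"},  # preferred over the deprecated `label` field
)
```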
diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py
index 0e7089bfc..44297d687 100644
--- a/python/sglang/srt/model_loader/weight_utils.py
+++ b/python/sglang/srt/model_loader/weight_utils.py
@@ -242,11 +242,8 @@ def find_local_hf_snapshot_dir(
     allow_patterns: List[str],
     revision: Optional[str] = None,
 ) -> Optional[str]:
-    """If the weights are already local, skip downloading and returns the path
-
-    Only applied in ci
-    """
-    if not is_in_ci() or os.path.isdir(model_name_or_path):
+    """If the weights are already local, skip downloading and return the path."""
+    if os.path.isdir(model_name_or_path):
         return None
 
     found_local_snapshot_dir = None
@@ -347,11 +344,14 @@ def download_weights_from_hf(
         str: The path to the downloaded model weights.
     """
 
-    path = find_local_hf_snapshot_dir(
-        model_name_or_path, cache_dir, allow_patterns, revision
-    )
-    if path is not None:
-        return path
+    if is_in_ci():
+        # If the weights are already local, skip downloading and return the path.
+        # This is used to skip too many Huggingface API calls in CI.
+        path = find_local_hf_snapshot_dir(
+            model_name_or_path, cache_dir, allow_patterns, revision
+        )
+        if path is not None:
+            return path
 
     if not huggingface_hub.constants.HF_HUB_OFFLINE:
         # Before we download we look at that is available:
diff --git a/python/sglang/srt/models/qwen3.py b/python/sglang/srt/models/qwen3.py
index a7551bb82..32bda876a 100644
--- a/python/sglang/srt/models/qwen3.py
+++ b/python/sglang/srt/models/qwen3.py
@@ -1,6 +1,5 @@
 # Adapted from qwen2.py
 import logging
-from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index bbe96fc9b..50c674480 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -523,57 +523,134 @@ class ServerArgs:
     def _handle_gpu_memory_settings(self, gpu_mem):
         """
-        Configure GPU memory-dependent settings including mem_fraction_static,
-        chunked_prefill_size, cuda_graph_max_bs, and cuda_graph_bs.
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity. This is because GPUs with more memory are generally more powerful, so we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+        GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+        The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+        or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+        In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+        The activation memory is proportional to the chunked_prefill_size.
+        The cuda graph memory is proportional to the cuda_graph_max_bs.
+        We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB,
+        and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+        The coefficient 1.5 is a heuristic value; in the future, we can do a better estimation by looking at the model types and hidden sizes, or even by doing a dummy run.
         """
-        # Set mem fraction static
-        if self.mem_fraction_static is None:
-            if gpu_mem is not None:
-                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
-                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
-
-                # We want mem_fraction_static to be as large as possible but still has enough room
-                # for activations and cuda graph buffers. We use the following heuristic to
-                # compute the needed size for activations and cuda graph buffers:
-                # - The size of the activation depends on the chunked_prefill_size and model size.
-                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
-                # For GPUs with more memory, we use a larger chunked_prefill_size and
-                # capture more cuda graphs, so they need to reserve more memory.
-                parallel_size = self.tp_size * self.pp_size
-
-                if gpu_mem < 20 * 1024:
-                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-                    reserved_mem = (2.8 + parallel_size / 10) * 1024
-                elif gpu_mem < 50 * 1024:
-                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
-                    reserved_mem = (2.8 + parallel_size / 10) * 1024
-                elif gpu_mem < 90 * 1024:
-                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
-                    reserved_mem = (12 + parallel_size / 2) * 1024
-                elif gpu_mem < 100 * 1024:
-                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 512)
-                    reserved_mem = (15 + parallel_size / 2) * 1024
-                elif gpu_mem < 160 * 1024:
-                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 512)
-                    reserved_mem = (15 + parallel_size / 2) * 1024
-                else:
-                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
-                    reserved_mem = 32 * 1024
-
-                # draft model and larger cuda graph buffers
-                if self.speculative_algorithm is not None:
-                    if self.speculative_algorithm == "STANDALONE":
-                        # Standalone speculative decoding needs more memory than other speculative
-                        # decoding algorithms since the draft model is typically larger.
-                        reserved_mem += 6 * 1024
-                    elif self.speculative_algorithm != "NGRAM":
-                        reserved_mem += 2 * 1024
-                if self.enable_dp_attention:
-                    reserved_mem += 4 * 1024
-
-                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 8
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 80
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
+                    else:
+                        self.cuda_graph_max_bs = 160
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
             else:
-                self.mem_fraction_static = 0.88
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
+                self.cuda_graph_max_bs = 160
+
+        # Set cuda graph batch sizes
+        if self.cuda_graph_bs is None:
+            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)
+
+        if self.mem_fraction_static is None:
+            # Constant metadata (e.g., from attention backend)
+            reserved_mem = 1024
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
+            else:
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 4 * 1024
+
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+            if gpu_mem > 60 * 1024:
+                reserved_mem = max(reserved_mem, 10 * 1024)
+
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalone draft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
+                    reserved_mem += 2 * 1024
+
+            self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
 
         # Lazy init to avoid circular import
         # Multimodal models need more memory for the image processor
@@ -583,49 +660,6 @@ class ServerArgs:
         if model_config.is_multimodal:
             self.adjust_mem_fraction_for_vlm(model_config)
 
-        # Set chunked prefill size, which depends on the gpu memory capacity
-        if self.chunked_prefill_size is None:
-            if gpu_mem is not None:
-                if gpu_mem < 50 * 1024:  # T4, 4080, A10, L40, 4090, 5090
-                    self.chunked_prefill_size = 2048
-                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
-                    self.chunked_prefill_size = 8192
-                else:  # B200, MI300
-                    self.chunked_prefill_size = 16384
-            else:
-                self.chunked_prefill_size = 4096
-
-        # Set cuda graph max batch size and cuda graph batch sizes
-        if self.cuda_graph_max_bs is None:
-            if gpu_mem is not None:
-                if gpu_mem < 20 * 1024:
-                    # T4, 4080
-                    self.cuda_graph_max_bs = 8
-                elif gpu_mem < 50 * 1024:
-                    # A10, L40, 4090, 5090
-                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
-                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
-                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-                    if self.tp_size < 4:
-                        self.cuda_graph_max_bs = 16
-                    else:
-                        self.cuda_graph_max_bs = 80
-                elif gpu_mem < 90 * 1024:
-                    # H100, A100
-                    if self.tp_size < 4:
-                        self.cuda_graph_max_bs = 256
-                    else:
-                        self.cuda_graph_max_bs = 512
-                else:
-                    # H20, H200, B200, MI300
-                    self.cuda_graph_max_bs = 512
-            else:
-                # Default fallback
-                self.cuda_graph_max_bs = 160
-
-        if self.cuda_graph_bs is None:
-            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
-
     def _generate_cuda_graph_batch_sizes(self):
         """
         Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
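To make the new heuristic concrete, here is a small, self-contained sketch of the reserved-memory arithmetic described in the docstring above (values in MiB). It assumes the simple case with no speculative decoding and no DP attention; the H100-class numbers are illustrative and not part of the change:

```python
def estimate_mem_fraction_static(
    gpu_mem, chunked_prefill_size, cuda_graph_max_bs, tp_size=1, pp_size=1
):
    # Constant metadata (e.g., attention backend buffers).
    reserved_mem = 1024
    # Activations for a large prefill chunk.
    reserved_mem += max(chunked_prefill_size, 2048) * 1.5
    # CUDA graph buffers.
    reserved_mem += cuda_graph_max_bs * 2
    # Adjustment for large parallel sizes.
    reserved_mem += tp_size * pp_size / 4 * 1024
    # Large GPUs reserve at least 10 GiB.
    if gpu_mem > 60 * 1024:
        reserved_mem = max(reserved_mem, 10 * 1024)
    return round((gpu_mem - reserved_mem) / gpu_mem, 3)


# Illustrative H100-class numbers: 80 GiB of HBM with the defaults chosen above
# (chunked_prefill_size=8192, cuda_graph_max_bs=256 for tp_size < 4).
print(estimate_mem_fraction_static(80 * 1024, 8192, 256))  # ~0.828
```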
diff --git a/test/srt/lora/test_lora_llama4.py b/test/srt/lora/test_lora_llama4.py
index 65a4b766f..c4a8695fc 100644
--- a/test/srt/lora/test_lora_llama4.py
+++ b/test/srt/lora/test_lora_llama4.py
@@ -38,7 +38,7 @@ class TestLlama4LoRA(CustomTestCase):
                 "--tp-size",
                 str(model.tp_size),
                 "--context-length",
-                "1048576",
+                "262144",
                 "--attention-backend",
                 "fa3",
             ],
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 5dbb7cfb7..11837c172 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -13,6 +13,7 @@ class TestFile:
 
 suites = {
     "per-commit": [
+        TestFile("function_call/test_json_schema_constraint.py", 30),
         TestFile("hicache/test_hicache.py", 116),
         TestFile("hicache/test_hicache_mla.py", 127),
         TestFile("hicache/test_hicache_storage.py", 127),
@@ -20,11 +21,9 @@ suites = {
         TestFile("lora/test_lora_eviction.py", 200),
         TestFile("lora/test_lora_backend.py", 99),
         TestFile("lora/test_multi_lora_backend.py", 60),
-        TestFile("lora/test_lora_cuda_graph.py", 250),
         TestFile("lora/test_lora_update.py", 400),
         TestFile("lora/test_lora_qwen3.py", 97),
         TestFile("lora/test_lora_radix_cache.py", 100),
-        TestFile("lora/test_chunked_sgmv_backend.py", 30),
         TestFile("models/test_embedding_models.py", 73),
         # TestFile("models/test_clip_models.py", 52),
         TestFile("models/test_encoder_embedding_models.py", 100),
@@ -51,7 +50,6 @@ suites = {
         TestFile("openai_server/features/test_reasoning_content.py", 89),
         TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
         TestFile("openai_server/function_call/test_tool_choice.py", 226),
-        TestFile("function_call/test_json_schema_constraint.py", 30),
         TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
         TestFile("openai_server/validation/test_matched_stop.py", 60),
         TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
@@ -144,8 +142,6 @@ suites = {
         TestFile("test_multi_instance_release_memory_occupation.py", 64),
     ],
     "per-commit-8-gpu": [
-        # Disabled because it hangs on the CI.
- # TestFile("ep/test_moe_ep.py", 181), TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800), TestFile("lora/test_lora_llama4.py", 600), TestFile("test_disaggregation.py", 499), diff --git a/test/srt/test_mla_deepseek_v3.py b/test/srt/test_mla_deepseek_v3.py index 634100fdb..4e9e99ce5 100644 --- a/test/srt/test_mla_deepseek_v3.py +++ b/test/srt/test_mla_deepseek_v3.py @@ -3,7 +3,6 @@ import unittest from types import SimpleNamespace import requests -import torch from sglang.srt.utils import is_cuda, is_hip, kill_process_tree from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k @@ -11,6 +10,7 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, popen_launch_server, ) @@ -50,6 +50,7 @@ class TestMLADeepseekV3(CustomTestCase): self.assertGreater(metrics["accuracy"], 0.62) +@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.") class TestMLADeepseekV3DisableFusedFunc(CustomTestCase): @classmethod def setUpClass(cls): diff --git a/test/srt/test_multi_instance_release_memory_occupation.py b/test/srt/test_multi_instance_release_memory_occupation.py index e4e8d9081..be2ff002d 100644 --- a/test/srt/test_multi_instance_release_memory_occupation.py +++ b/test/srt/test_multi_instance_release_memory_occupation.py @@ -1,6 +1,6 @@ import multiprocessing import os -import subprocess +import time import traceback import unittest from multiprocessing import Process @@ -21,7 +21,7 @@ from sglang.test.test_utils import ( TEST_SUITE = dict( model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST, - mem_fraction_static=0.85, + mem_fraction_static=0.83, dp_size=2, tp_size=2, ) @@ -214,6 +214,9 @@ def _run_sglang_subprocess( _mem_usage = get_gpu_memory_gb(rank) print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}") del hf_model + hf_model = None + torch.cuda.empty_cache() + time.sleep(5) torch.cuda.empty_cache() _curr_usage = get_gpu_memory_gb(rank) assert (