Use more general heuristics to set the default value of --mem-fraction-static (#10975)

Co-authored-by: sglang-bot <sglangbot@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-09-29 10:11:03 -07:00
committed by GitHub
parent 816b3a433a
commit a17e70f5cc
9 changed files with 167 additions and 151 deletions

View File

@@ -99,8 +99,6 @@ jobs:
needs: [check-changes, sgl-kernel-build-wheels] needs: [check-changes, sgl-kernel-build-wheels]
if: needs.check-changes.outputs.sgl_kernel == 'true' if: needs.check-changes.outputs.sgl_kernel == 'true'
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -233,8 +231,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -266,8 +262,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 2-gpu-runner runs-on: 2-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -299,8 +293,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 4-gpu-runner runs-on: 4-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -332,8 +324,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 8-gpu-runner runs-on: 8-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -365,8 +355,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -426,8 +414,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -479,8 +465,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 2-gpu-runner runs-on: 2-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -538,8 +522,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -570,8 +552,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 2-gpu-runner runs-on: 2-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -602,8 +582,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 4-gpu-runner runs-on: 4-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -631,8 +609,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 8-gpu-runner runs-on: 8-gpu-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -660,8 +636,6 @@ jobs:
if: always() && !failure() && !cancelled() && if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 4-b200-runner runs-on: 4-b200-runner
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
strategy: strategy:
fail-fast: false fail-fast: false
steps: steps:

View File

@@ -35,6 +35,7 @@ else:
Image = Any Image = Any
# Parameters for a session
@dataclass @dataclass
class SessionParams: class SessionParams:
id: Optional[str] = None id: Optional[str] = None
@@ -84,8 +85,6 @@ class GenerateReqInput:
sampling_params: Optional[Union[List[Dict], Dict]] = None sampling_params: Optional[Union[List[Dict], Dict]] = None
# The request id. # The request id.
rid: Optional[Union[List[str], str]] = None rid: Optional[Union[List[str], str]] = None
# Extra key for classifying the request (e.g. cache_salt)
extra_key: Optional[Union[List[str], str]] = None
# Whether to return logprobs. # Whether to return logprobs.
return_logprob: Optional[Union[List[bool], bool]] = None return_logprob: Optional[Union[List[bool], bool]] = None
# If return logprobs, the start location in the prompt for returning logprobs. # If return logprobs, the start location in the prompt for returning logprobs.
@@ -134,18 +133,23 @@ class GenerateReqInput:
# Conversation id used for tracking requests # Conversation id used for tracking requests
conversation_id: Optional[str] = None conversation_id: Optional[str] = None
# (Deprecated, please use custom_labels) Label for the request
label: Optional[str] = None
# Priority for the request # Priority for the request
priority: Optional[int] = None priority: Optional[int] = None
# Image gen grpc migration # Extra key for classifying the request (e.g. cache_salt)
return_bytes: bool = False extra_key: Optional[Union[List[str], str]] = None
# Whether to disallow logging for this request (e.g. due to ZDR)
no_logs: bool = False
# For custom metric labels # For custom metric labels
custom_labels: Optional[Dict[str, str]] = None custom_labels: Optional[Dict[str, str]] = None
# (Deprecated, please use custom_labels) Label for the request
label: Optional[str] = None
# (Internal) Whether to return bytes for image generation
return_bytes: bool = False
def contains_mm_input(self) -> bool: def contains_mm_input(self) -> bool:
return ( return (
has_valid_data(self.image_data) has_valid_data(self.image_data)
@@ -544,8 +548,11 @@ class GenerateReqInput:
self.data_parallel_rank if self.data_parallel_rank is not None else None self.data_parallel_rank if self.data_parallel_rank is not None else None
), ),
conversation_id=self.conversation_id, conversation_id=self.conversation_id,
label=self.label,
priority=self.priority, priority=self.priority,
extra_key=self.extra_key,
no_logs=self.no_logs,
custom_labels=self.custom_labels,
label=self.label,
return_bytes=self.return_bytes, return_bytes=self.return_bytes,
) )
@@ -602,21 +609,23 @@ class TokenizedGenerateReqInput:
# For dp balance # For dp balance
dp_balance_id: int = -1 dp_balance_id: int = -1
# Label for the request
label: Optional[str] = None
# Priority for the request # Priority for the request
priority: Optional[int] = None priority: Optional[int] = None
# Extra key for classifying the request (e.g. cache_salt) # Extra key for classifying the request (e.g. cache_salt)
extra_key: Optional[str] = None extra_key: Optional[str] = None
# Image gen grpc migration # Whether to disallow logging for this request (e.g. due to ZDR)
return_bytes: bool = False no_logs: bool = False
# tracing context # tracing context
trace_context: Optional[Dict] = None trace_context: Optional[Dict] = None
# (Deprecated, please use custom_labels) Label for the request
label: Optional[str] = None
# (Internal) Whether to return bytes for image generation
return_bytes: bool = False
@dataclass @dataclass
class BatchTokenizedGenerateReqInput: class BatchTokenizedGenerateReqInput:

View File

@@ -242,11 +242,8 @@ def find_local_hf_snapshot_dir(
allow_patterns: List[str], allow_patterns: List[str],
revision: Optional[str] = None, revision: Optional[str] = None,
) -> Optional[str]: ) -> Optional[str]:
"""If the weights are already local, skip downloading and return the path """If the weights are already local, skip downloading and return the path."""
if os.path.isdir(model_name_or_path):
Only applied in ci
"""
if not is_in_ci() or os.path.isdir(model_name_or_path):
return None return None
found_local_snapshot_dir = None found_local_snapshot_dir = None
@@ -347,11 +344,14 @@ def download_weights_from_hf(
str: The path to the downloaded model weights. str: The path to the downloaded model weights.
""" """
path = find_local_hf_snapshot_dir( if is_in_ci():
model_name_or_path, cache_dir, allow_patterns, revision # If the weights are already local, skip downloading and return the path.
) # This is used to avoid making too many Hugging Face API calls in CI.
if path is not None: path = find_local_hf_snapshot_dir(
return path model_name_or_path, cache_dir, allow_patterns, revision
)
if path is not None:
return path
if not huggingface_hub.constants.HF_HUB_OFFLINE: if not huggingface_hub.constants.HF_HUB_OFFLINE:
# Before we download we look at what is available: # Before we download we look at what is available:

View File

@@ -1,6 +1,5 @@
# Adapted from qwen2.py # Adapted from qwen2.py
import logging import logging
from functools import partial
from typing import Any, Dict, Iterable, List, Optional, Tuple from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch import torch

View File

@@ -523,57 +523,134 @@ class ServerArgs:
def _handle_gpu_memory_settings(self, gpu_mem): def _handle_gpu_memory_settings(self, gpu_mem):
""" """
Configure GPU memory-dependent settings including mem_fraction_static, Configure GPU memory-dependent settings including
chunked_prefill_size, cuda_graph_max_bs, and cuda_graph_bs. chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
Here are our heuristics:
- Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
Because GPUs with more memory are generally more powerful, we need to use a larger
chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
- Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
The activation memory is proportional to the chunked_prefill_size.
The cuda graph memory is proportional to the cuda_graph_max_bs.
We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in MB,
and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
The coefficient 1.5 is a heuristic value; in the future, we can do a better estimation by looking at the model types and hidden sizes, or even by doing a dummy run.
""" """
# Set mem fraction static if gpu_mem is not None:
if self.mem_fraction_static is None: if gpu_mem < 20 * 1024:
if gpu_mem is not None: # T4, 4080
# GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
# mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity. if self.chunked_prefill_size is None:
self.chunked_prefill_size = 2048
# We want mem_fraction_static to be as large as possible but still has enough room if self.cuda_graph_max_bs is None:
# for activations and cuda graph buffers. We use the following heuristic to self.cuda_graph_max_bs = 8
# compute the needed size for activations and cuda graph buffers: elif gpu_mem < 35 * 1024:
# - The size of the activation depends on the chunked_prefill_size and model size. # A10, 4090, 5090
# - The size of cuda graph buffers depends on the cuda graph capture range and model size. # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
# For GPUs with more memory, we use a larger chunked_prefill_size and if self.chunked_prefill_size is None:
# capture more cuda graphs, so they need to reserve more memory. self.chunked_prefill_size = 2048
parallel_size = self.tp_size * self.pp_size if self.cuda_graph_max_bs is None:
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
if gpu_mem < 20 * 1024: # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
# T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8) # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
reserved_mem = (2.8 + parallel_size / 10) * 1024 if self.tp_size < 4:
elif gpu_mem < 50 * 1024: self.cuda_graph_max_bs = 16
# A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80) else:
reserved_mem = (2.8 + parallel_size / 10) * 1024 self.cuda_graph_max_bs = 80
elif gpu_mem < 90 * 1024: elif gpu_mem < 60 * 1024:
# H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512) # A100 (40GB), L40,
reserved_mem = (12 + parallel_size / 2) * 1024 # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
elif gpu_mem < 100 * 1024: if self.chunked_prefill_size is None:
# H20. (chunked_prefill_size 8k, cuda_graph_max_bs 512) self.chunked_prefill_size = 4096
reserved_mem = (15 + parallel_size / 2) * 1024 if self.cuda_graph_max_bs is None:
elif gpu_mem < 160 * 1024: if self.tp_size < 4:
# H200. (chunked_prefill_size 8k, cuda_graph_max_bs 512) self.cuda_graph_max_bs = 32
reserved_mem = (15 + parallel_size / 2) * 1024 else:
else: self.cuda_graph_max_bs = 160
# B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512) elif gpu_mem < 90 * 1024:
reserved_mem = 32 * 1024 # H100, A100
# (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
# draft model and larger cuda graph buffers if self.chunked_prefill_size is None:
if self.speculative_algorithm is not None: self.chunked_prefill_size = 8192
if self.speculative_algorithm == "STANDALONE": if self.cuda_graph_max_bs is None:
# Standalone speculative decoding needs more memory than other speculative if self.tp_size < 4:
# decoding algorithms since the draft model is typically larger. self.cuda_graph_max_bs = 256
reserved_mem += 6 * 1024 else:
elif self.speculative_algorithm != "NGRAM": self.cuda_graph_max_bs = 512
reserved_mem += 2 * 1024 elif gpu_mem < 160 * 1024:
if self.enable_dp_attention: # H20, H200
reserved_mem += 4 * 1024 # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
if self.chunked_prefill_size is None:
self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3) self.chunked_prefill_size = 8192
if self.cuda_graph_max_bs is None:
if self.tp_size < 4:
self.cuda_graph_max_bs = 256
else:
self.cuda_graph_max_bs = 512
else: else:
self.mem_fraction_static = 0.88 # B200, MI300
# (chunked_prefill_size 16k, cuda_graph_max_bs 512)
if self.chunked_prefill_size is None:
self.chunked_prefill_size = 16384
if self.cuda_graph_max_bs is None:
self.cuda_graph_max_bs = 512
else:
# Fallback defaults when gpu_mem is None
if self.chunked_prefill_size is None:
self.chunked_prefill_size = 4096
if self.cuda_graph_max_bs is None:
self.cuda_graph_max_bs = 160
# Set cuda graph batch sizes
if self.cuda_graph_bs is None:
self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
else:
self.cuda_graph_max_bs = max(self.cuda_graph_bs)
if self.mem_fraction_static is None:
# Constant metadata (e.g., from attention backend)
reserved_mem = 1024
# For activation during large prefill
if self.chunked_prefill_size > 0:
reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
else:
reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
# For cuda graphs
reserved_mem += self.cuda_graph_max_bs * 2
# Some adjustments for large parallel size
reserved_mem += self.tp_size * self.pp_size / 4 * 1024
if self.enable_dp_attention:
# DP attention needs more padding for some operations
reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
# DP attention uses much more memory for large cuda graph max bs,
# likely due to some inefficiencies in torch allocator or our implementation.
# So we need to reserve more memory.
if self.cuda_graph_max_bs > 300:
reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
if gpu_mem > 60 * 1024:
reserved_mem = max(reserved_mem, 10 * 1024)
if self.speculative_algorithm is not None:
if self.speculative_algorithm == "STANDALONE":
# standalone draft model and cuda graphs
reserved_mem += 6 * 1024
elif self.speculative_algorithm != "NGRAM":
# eagle draft models and cuda graphs
reserved_mem += 2 * 1024
self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
# Lazy init to avoid circular import # Lazy init to avoid circular import
# Multimodal models need more memory for the image processor # Multimodal models need more memory for the image processor
@@ -583,49 +660,6 @@ class ServerArgs:
if model_config.is_multimodal: if model_config.is_multimodal:
self.adjust_mem_fraction_for_vlm(model_config) self.adjust_mem_fraction_for_vlm(model_config)
# Set chunked prefill size, which depends on the gpu memory capacity
if self.chunked_prefill_size is None:
if gpu_mem is not None:
if gpu_mem < 50 * 1024: # T4, 4080, A10, L40, 4090, 5090
self.chunked_prefill_size = 2048
elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
self.chunked_prefill_size = 8192
else: # B200, MI300
self.chunked_prefill_size = 16384
else:
self.chunked_prefill_size = 4096
# Set cuda graph max batch size and cuda graph batch sizes
if self.cuda_graph_max_bs is None:
if gpu_mem is not None:
if gpu_mem < 20 * 1024:
# T4, 4080
self.cuda_graph_max_bs = 8
elif gpu_mem < 50 * 1024:
# A10, L40, 4090, 5090
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
# However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
# from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
if self.tp_size < 4:
self.cuda_graph_max_bs = 16
else:
self.cuda_graph_max_bs = 80
elif gpu_mem < 90 * 1024:
# H100, A100
if self.tp_size < 4:
self.cuda_graph_max_bs = 256
else:
self.cuda_graph_max_bs = 512
else:
# H20, H200, B200, MI300
self.cuda_graph_max_bs = 512
else:
# Default fallback
self.cuda_graph_max_bs = 160
if self.cuda_graph_bs is None:
self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
def _generate_cuda_graph_batch_sizes(self): def _generate_cuda_graph_batch_sizes(self):
""" """
Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs. Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.

View File

@@ -38,7 +38,7 @@ class TestLlama4LoRA(CustomTestCase):
"--tp-size", "--tp-size",
str(model.tp_size), str(model.tp_size),
"--context-length", "--context-length",
"1048576", "262144",
"--attention-backend", "--attention-backend",
"fa3", "fa3",
], ],

View File

@@ -13,6 +13,7 @@ class TestFile:
suites = { suites = {
"per-commit": [ "per-commit": [
TestFile("function_call/test_json_schema_constraint.py", 30),
TestFile("hicache/test_hicache.py", 116), TestFile("hicache/test_hicache.py", 116),
TestFile("hicache/test_hicache_mla.py", 127), TestFile("hicache/test_hicache_mla.py", 127),
TestFile("hicache/test_hicache_storage.py", 127), TestFile("hicache/test_hicache_storage.py", 127),
@@ -20,11 +21,9 @@ suites = {
TestFile("lora/test_lora_eviction.py", 200), TestFile("lora/test_lora_eviction.py", 200),
TestFile("lora/test_lora_backend.py", 99), TestFile("lora/test_lora_backend.py", 99),
TestFile("lora/test_multi_lora_backend.py", 60), TestFile("lora/test_multi_lora_backend.py", 60),
TestFile("lora/test_lora_cuda_graph.py", 250),
TestFile("lora/test_lora_update.py", 400), TestFile("lora/test_lora_update.py", 400),
TestFile("lora/test_lora_qwen3.py", 97), TestFile("lora/test_lora_qwen3.py", 97),
TestFile("lora/test_lora_radix_cache.py", 100), TestFile("lora/test_lora_radix_cache.py", 100),
TestFile("lora/test_chunked_sgmv_backend.py", 30),
TestFile("models/test_embedding_models.py", 73), TestFile("models/test_embedding_models.py", 73),
# TestFile("models/test_clip_models.py", 52), # TestFile("models/test_clip_models.py", 52),
TestFile("models/test_encoder_embedding_models.py", 100), TestFile("models/test_encoder_embedding_models.py", 100),
@@ -51,7 +50,6 @@ suites = {
TestFile("openai_server/features/test_reasoning_content.py", 89), TestFile("openai_server/features/test_reasoning_content.py", 89),
TestFile("openai_server/function_call/test_openai_function_calling.py", 60), TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
TestFile("openai_server/function_call/test_tool_choice.py", 226), TestFile("openai_server/function_call/test_tool_choice.py", 226),
TestFile("function_call/test_json_schema_constraint.py", 30),
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41), TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
TestFile("openai_server/validation/test_matched_stop.py", 60), TestFile("openai_server/validation/test_matched_stop.py", 60),
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85), TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
@@ -144,8 +142,6 @@ suites = {
TestFile("test_multi_instance_release_memory_occupation.py", 64), TestFile("test_multi_instance_release_memory_occupation.py", 64),
], ],
"per-commit-8-gpu": [ "per-commit-8-gpu": [
# Disabled because it hangs on the CI.
# TestFile("ep/test_moe_ep.py", 181),
TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800), TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800),
TestFile("lora/test_lora_llama4.py", 600), TestFile("lora/test_lora_llama4.py", 600),
TestFile("test_disaggregation.py", 499), TestFile("test_disaggregation.py", 499),

View File

@@ -3,7 +3,6 @@ import unittest
from types import SimpleNamespace from types import SimpleNamespace
import requests import requests
import torch
from sglang.srt.utils import is_cuda, is_hip, kill_process_tree from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -11,6 +10,7 @@ from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST, DEFAULT_URL_FOR_TEST,
CustomTestCase, CustomTestCase,
is_in_ci,
popen_launch_server, popen_launch_server,
) )
@@ -50,6 +50,7 @@ class TestMLADeepseekV3(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.62) self.assertGreater(metrics["accuracy"], 0.62)
@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
class TestMLADeepseekV3DisableFusedFunc(CustomTestCase): class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):

View File

@@ -1,6 +1,6 @@
import multiprocessing import multiprocessing
import os import os
import subprocess import time
import traceback import traceback
import unittest import unittest
from multiprocessing import Process from multiprocessing import Process
@@ -21,7 +21,7 @@ from sglang.test.test_utils import (
TEST_SUITE = dict( TEST_SUITE = dict(
model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST, model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
mem_fraction_static=0.85, mem_fraction_static=0.83,
dp_size=2, dp_size=2,
tp_size=2, tp_size=2,
) )
@@ -214,6 +214,9 @@ def _run_sglang_subprocess(
_mem_usage = get_gpu_memory_gb(rank) _mem_usage = get_gpu_memory_gb(rank)
print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}") print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
del hf_model del hf_model
hf_model = None
torch.cuda.empty_cache()
time.sleep(5)
torch.cuda.empty_cache() torch.cuda.empty_cache()
_curr_usage = get_gpu_memory_gb(rank) _curr_usage = get_gpu_memory_gb(rank)
assert ( assert (