Use more general heuristics to set the default value of --mem-fraction-static (#10975)
Co-authored-by: sglang-bot <sglangbot@gmail.com>
This commit is contained in:
26
.github/workflows/pr-test.yml
vendored
26
.github/workflows/pr-test.yml
vendored
@@ -99,8 +99,6 @@ jobs:
|
|||||||
needs: [check-changes, sgl-kernel-build-wheels]
|
needs: [check-changes, sgl-kernel-build-wheels]
|
||||||
if: needs.check-changes.outputs.sgl_kernel == 'true'
|
if: needs.check-changes.outputs.sgl_kernel == 'true'
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
@@ -233,8 +231,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -266,8 +262,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 2-gpu-runner
|
runs-on: 2-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -299,8 +293,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 4-gpu-runner
|
runs-on: 4-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -332,8 +324,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 8-gpu-runner
|
runs-on: 8-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -365,8 +355,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -426,8 +414,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -479,8 +465,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 2-gpu-runner
|
runs-on: 2-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -538,8 +522,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -570,8 +552,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 2-gpu-runner
|
runs-on: 2-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -602,8 +582,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 4-gpu-runner
|
runs-on: 4-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -631,8 +609,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 8-gpu-runner
|
runs-on: 8-gpu-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -660,8 +636,6 @@ jobs:
|
|||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
runs-on: 4-b200-runner
|
runs-on: 4-b200-runner
|
||||||
env:
|
|
||||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ else:
|
|||||||
Image = Any
|
Image = Any
|
||||||
|
|
||||||
|
|
||||||
|
# Parameters for a session
|
||||||
@dataclass
|
@dataclass
|
||||||
class SessionParams:
|
class SessionParams:
|
||||||
id: Optional[str] = None
|
id: Optional[str] = None
|
||||||
@@ -84,8 +85,6 @@ class GenerateReqInput:
|
|||||||
sampling_params: Optional[Union[List[Dict], Dict]] = None
|
sampling_params: Optional[Union[List[Dict], Dict]] = None
|
||||||
# The request id.
|
# The request id.
|
||||||
rid: Optional[Union[List[str], str]] = None
|
rid: Optional[Union[List[str], str]] = None
|
||||||
# Extra key for classifying the request (e.g. cache_salt)
|
|
||||||
extra_key: Optional[Union[List[str], str]] = None
|
|
||||||
# Whether to return logprobs.
|
# Whether to return logprobs.
|
||||||
return_logprob: Optional[Union[List[bool], bool]] = None
|
return_logprob: Optional[Union[List[bool], bool]] = None
|
||||||
# If return logprobs, the start location in the prompt for returning logprobs.
|
# If return logprobs, the start location in the prompt for returning logprobs.
|
||||||
@@ -134,18 +133,23 @@ class GenerateReqInput:
|
|||||||
# Conversation id used for tracking requests
|
# Conversation id used for tracking requests
|
||||||
conversation_id: Optional[str] = None
|
conversation_id: Optional[str] = None
|
||||||
|
|
||||||
# (Deprecated, please use custom_labels) Label for the request
|
|
||||||
label: Optional[str] = None
|
|
||||||
|
|
||||||
# Priority for the request
|
# Priority for the request
|
||||||
priority: Optional[int] = None
|
priority: Optional[int] = None
|
||||||
|
|
||||||
# Image gen grpc migration
|
# Extra key for classifying the request (e.g. cache_salt)
|
||||||
return_bytes: bool = False
|
extra_key: Optional[Union[List[str], str]] = None
|
||||||
|
|
||||||
|
# Whether to disallow logging for this request (e.g. due to ZDR)
|
||||||
|
no_logs: bool = False
|
||||||
|
|
||||||
# For custom metric labels
|
# For custom metric labels
|
||||||
custom_labels: Optional[Dict[str, str]] = None
|
custom_labels: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
|
# (Deprecated, please use custom_labels) Label for the request
|
||||||
|
label: Optional[str] = None
|
||||||
|
# (Internal) Whether to return bytes for image generation
|
||||||
|
return_bytes: bool = False
|
||||||
|
|
||||||
def contains_mm_input(self) -> bool:
|
def contains_mm_input(self) -> bool:
|
||||||
return (
|
return (
|
||||||
has_valid_data(self.image_data)
|
has_valid_data(self.image_data)
|
||||||
@@ -544,8 +548,11 @@ class GenerateReqInput:
|
|||||||
self.data_parallel_rank if self.data_parallel_rank is not None else None
|
self.data_parallel_rank if self.data_parallel_rank is not None else None
|
||||||
),
|
),
|
||||||
conversation_id=self.conversation_id,
|
conversation_id=self.conversation_id,
|
||||||
label=self.label,
|
|
||||||
priority=self.priority,
|
priority=self.priority,
|
||||||
|
extra_key=self.extra_key,
|
||||||
|
no_logs=self.no_logs,
|
||||||
|
custom_labels=self.custom_labels,
|
||||||
|
label=self.label,
|
||||||
return_bytes=self.return_bytes,
|
return_bytes=self.return_bytes,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -602,21 +609,23 @@ class TokenizedGenerateReqInput:
|
|||||||
# For dp balance
|
# For dp balance
|
||||||
dp_balance_id: int = -1
|
dp_balance_id: int = -1
|
||||||
|
|
||||||
# Label for the request
|
|
||||||
label: Optional[str] = None
|
|
||||||
|
|
||||||
# Priority for the request
|
# Priority for the request
|
||||||
priority: Optional[int] = None
|
priority: Optional[int] = None
|
||||||
|
|
||||||
# Extra key for classifying the request (e.g. cache_salt)
|
# Extra key for classifying the request (e.g. cache_salt)
|
||||||
extra_key: Optional[str] = None
|
extra_key: Optional[str] = None
|
||||||
|
|
||||||
# Image gen grpc migration
|
# Whether to disallow logging for this request (e.g. due to ZDR)
|
||||||
return_bytes: bool = False
|
no_logs: bool = False
|
||||||
|
|
||||||
# tracing context
|
# tracing context
|
||||||
trace_context: Optional[Dict] = None
|
trace_context: Optional[Dict] = None
|
||||||
|
|
||||||
|
# (Deprecated, please use custom_labels) Label for the request
|
||||||
|
label: Optional[str] = None
|
||||||
|
# (Internal) Whether to return bytes for image generation
|
||||||
|
return_bytes: bool = False
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BatchTokenizedGenerateReqInput:
|
class BatchTokenizedGenerateReqInput:
|
||||||
|
|||||||
@@ -242,11 +242,8 @@ def find_local_hf_snapshot_dir(
|
|||||||
allow_patterns: List[str],
|
allow_patterns: List[str],
|
||||||
revision: Optional[str] = None,
|
revision: Optional[str] = None,
|
||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
"""If the weights are already local, skip downloading and returns the path
|
"""If the weights are already local, skip downloading and returns the path."""
|
||||||
|
if os.path.isdir(model_name_or_path):
|
||||||
Only applied in ci
|
|
||||||
"""
|
|
||||||
if not is_in_ci() or os.path.isdir(model_name_or_path):
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
found_local_snapshot_dir = None
|
found_local_snapshot_dir = None
|
||||||
@@ -347,11 +344,14 @@ def download_weights_from_hf(
|
|||||||
str: The path to the downloaded model weights.
|
str: The path to the downloaded model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
path = find_local_hf_snapshot_dir(
|
if is_in_ci():
|
||||||
model_name_or_path, cache_dir, allow_patterns, revision
|
# If the weights are already local, skip downloading and return the path.
|
||||||
)
|
# This is used to avoid making too many Hugging Face API calls in CI.
|
||||||
if path is not None:
|
path = find_local_hf_snapshot_dir(
|
||||||
return path
|
model_name_or_path, cache_dir, allow_patterns, revision
|
||||||
|
)
|
||||||
|
if path is not None:
|
||||||
|
return path
|
||||||
|
|
||||||
if not huggingface_hub.constants.HF_HUB_OFFLINE:
|
if not huggingface_hub.constants.HF_HUB_OFFLINE:
|
||||||
# Before we download, we look at what is available:
|
# Before we download, we look at what is available:
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
# Adapted from qwen2.py
|
# Adapted from qwen2.py
|
||||||
import logging
|
import logging
|
||||||
from functools import partial
|
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|||||||
@@ -523,57 +523,134 @@ class ServerArgs:
|
|||||||
|
|
||||||
def _handle_gpu_memory_settings(self, gpu_mem):
|
def _handle_gpu_memory_settings(self, gpu_mem):
|
||||||
"""
|
"""
|
||||||
Configure GPU memory-dependent settings including mem_fraction_static,
|
Configure GPU memory-dependent settings including
|
||||||
chunked_prefill_size, cuda_graph_max_bs, and cuda_graph_bs.
|
chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
|
||||||
|
|
||||||
|
Here are our heuristics:
|
||||||
|
- Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
|
||||||
|
This is because GPUs with more memory are generally more powerful, so we need to use a larger
|
||||||
|
chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
|
||||||
|
- Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
|
||||||
|
|
||||||
|
GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
|
||||||
|
|
||||||
|
The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
|
||||||
|
or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
|
||||||
|
|
||||||
|
In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
|
||||||
|
The activation memory is proportional to the chunked_prefill_size.
|
||||||
|
The cuda graph memory is proportional to the cuda_graph_max_bs.
|
||||||
|
We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in MB,
|
||||||
|
and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
|
||||||
|
|
||||||
|
The coefficient 1.5 is a heuristic value. In the future, we can do better estimation by looking at the model types and hidden sizes, or even by doing a dummy run.
|
||||||
"""
|
"""
|
||||||
# Set mem fraction static
|
if gpu_mem is not None:
|
||||||
if self.mem_fraction_static is None:
|
if gpu_mem < 20 * 1024:
|
||||||
if gpu_mem is not None:
|
# T4, 4080
|
||||||
# GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
|
# (chunked_prefill_size 2k, cuda_graph_max_bs 8)
|
||||||
# mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
|
if self.chunked_prefill_size is None:
|
||||||
|
self.chunked_prefill_size = 2048
|
||||||
# We want mem_fraction_static to be as large as possible but still has enough room
|
if self.cuda_graph_max_bs is None:
|
||||||
# for activations and cuda graph buffers. We use the following heuristic to
|
self.cuda_graph_max_bs = 8
|
||||||
# compute the needed size for activations and cuda graph buffers:
|
elif gpu_mem < 35 * 1024:
|
||||||
# - The size of the activation depends on the chunked_prefill_size and model size.
|
# A10, 4090, 5090
|
||||||
# - The size of cuda graph buffers depends on the cuda graph capture range and model size.
|
# (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
|
||||||
# For GPUs with more memory, we use a larger chunked_prefill_size and
|
if self.chunked_prefill_size is None:
|
||||||
# capture more cuda graphs, so they need to reserve more memory.
|
self.chunked_prefill_size = 2048
|
||||||
parallel_size = self.tp_size * self.pp_size
|
if self.cuda_graph_max_bs is None:
|
||||||
|
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
|
||||||
if gpu_mem < 20 * 1024:
|
# However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
|
||||||
# T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
|
# from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
|
||||||
reserved_mem = (2.8 + parallel_size / 10) * 1024
|
if self.tp_size < 4:
|
||||||
elif gpu_mem < 50 * 1024:
|
self.cuda_graph_max_bs = 16
|
||||||
# A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
|
else:
|
||||||
reserved_mem = (2.8 + parallel_size / 10) * 1024
|
self.cuda_graph_max_bs = 80
|
||||||
elif gpu_mem < 90 * 1024:
|
elif gpu_mem < 60 * 1024:
|
||||||
# H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
|
# A100 (40GB), L40,
|
||||||
reserved_mem = (12 + parallel_size / 2) * 1024
|
# (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
|
||||||
elif gpu_mem < 100 * 1024:
|
if self.chunked_prefill_size is None:
|
||||||
# H20. (chunked_prefill_size 8k, cuda_graph_max_bs 512)
|
self.chunked_prefill_size = 4096
|
||||||
reserved_mem = (15 + parallel_size / 2) * 1024
|
if self.cuda_graph_max_bs is None:
|
||||||
elif gpu_mem < 160 * 1024:
|
if self.tp_size < 4:
|
||||||
# H200. (chunked_prefill_size 8k, cuda_graph_max_bs 512)
|
self.cuda_graph_max_bs = 32
|
||||||
reserved_mem = (15 + parallel_size / 2) * 1024
|
else:
|
||||||
else:
|
self.cuda_graph_max_bs = 160
|
||||||
# B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
|
elif gpu_mem < 90 * 1024:
|
||||||
reserved_mem = 32 * 1024
|
# H100, A100
|
||||||
|
# (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
|
||||||
# draft model and larger cuda graph buffers
|
if self.chunked_prefill_size is None:
|
||||||
if self.speculative_algorithm is not None:
|
self.chunked_prefill_size = 8192
|
||||||
if self.speculative_algorithm == "STANDALONE":
|
if self.cuda_graph_max_bs is None:
|
||||||
# Standalone speculative decoding needs more memory than other speculative
|
if self.tp_size < 4:
|
||||||
# decoding algorithms since the draft model is typically larger.
|
self.cuda_graph_max_bs = 256
|
||||||
reserved_mem += 6 * 1024
|
else:
|
||||||
elif self.speculative_algorithm != "NGRAM":
|
self.cuda_graph_max_bs = 512
|
||||||
reserved_mem += 2 * 1024
|
elif gpu_mem < 160 * 1024:
|
||||||
if self.enable_dp_attention:
|
# H20, H200
|
||||||
reserved_mem += 4 * 1024
|
# (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
|
||||||
|
if self.chunked_prefill_size is None:
|
||||||
self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
|
self.chunked_prefill_size = 8192
|
||||||
|
if self.cuda_graph_max_bs is None:
|
||||||
|
if self.tp_size < 4:
|
||||||
|
self.cuda_graph_max_bs = 256
|
||||||
|
else:
|
||||||
|
self.cuda_graph_max_bs = 512
|
||||||
else:
|
else:
|
||||||
self.mem_fraction_static = 0.88
|
# B200, MI300
|
||||||
|
# (chunked_prefill_size 16k, cuda_graph_max_bs 512)
|
||||||
|
if self.chunked_prefill_size is None:
|
||||||
|
self.chunked_prefill_size = 16384
|
||||||
|
if self.cuda_graph_max_bs is None:
|
||||||
|
self.cuda_graph_max_bs = 512
|
||||||
|
else:
|
||||||
|
# Fallback defaults when gpu_mem is None
|
||||||
|
if self.chunked_prefill_size is None:
|
||||||
|
self.chunked_prefill_size = 4096
|
||||||
|
if self.cuda_graph_max_bs is None:
|
||||||
|
self.cuda_graph_max_bs = 160
|
||||||
|
|
||||||
|
# Set cuda graph batch sizes
|
||||||
|
if self.cuda_graph_bs is None:
|
||||||
|
self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
|
||||||
|
else:
|
||||||
|
self.cuda_graph_max_bs = max(self.cuda_graph_bs)
|
||||||
|
|
||||||
|
if self.mem_fraction_static is None:
|
||||||
|
# Constant meta data (e.g., from attention backend)
|
||||||
|
reserved_mem = 1024
|
||||||
|
# For activation during large prefill
|
||||||
|
if self.chunked_prefill_size > 0:
|
||||||
|
reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
|
||||||
|
else:
|
||||||
|
reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
|
||||||
|
# For cuda graphs
|
||||||
|
reserved_mem += self.cuda_graph_max_bs * 2
|
||||||
|
# Some adjustments for large parallel size
|
||||||
|
reserved_mem += self.tp_size * self.pp_size / 4 * 1024
|
||||||
|
|
||||||
|
if self.enable_dp_attention:
|
||||||
|
# DP attention needs more padding for some operations
|
||||||
|
reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
|
||||||
|
|
||||||
|
# DP attention uses much more memory for large cuda graph max bs,
|
||||||
|
# likely due to some inefficiencies in torch allocator or our implementation.
|
||||||
|
# So we need to reserve more memory.
|
||||||
|
if self.cuda_graph_max_bs > 300:
|
||||||
|
reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
|
||||||
|
|
||||||
|
if gpu_mem > 60 * 1024:
|
||||||
|
reserved_mem = max(reserved_mem, 10 * 1024)
|
||||||
|
|
||||||
|
if self.speculative_algorithm is not None:
|
||||||
|
if self.speculative_algorithm == "STANDALONE":
|
||||||
|
# standalone draft model and cuda graphs
|
||||||
|
reserved_mem += 6 * 1024
|
||||||
|
elif self.speculative_algorithm != "NGRAM":
|
||||||
|
# eagle draft models and cuda graphs
|
||||||
|
reserved_mem += 2 * 1024
|
||||||
|
|
||||||
|
self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
|
||||||
|
|
||||||
# Lazy init to avoid circular import
|
# Lazy init to avoid circular import
|
||||||
# Multimodal models need more memory for the image processor
|
# Multimodal models need more memory for the image processor
|
||||||
@@ -583,49 +660,6 @@ class ServerArgs:
|
|||||||
if model_config.is_multimodal:
|
if model_config.is_multimodal:
|
||||||
self.adjust_mem_fraction_for_vlm(model_config)
|
self.adjust_mem_fraction_for_vlm(model_config)
|
||||||
|
|
||||||
# Set chunked prefill size, which depends on the gpu memory capacity
|
|
||||||
if self.chunked_prefill_size is None:
|
|
||||||
if gpu_mem is not None:
|
|
||||||
if gpu_mem < 50 * 1024: # T4, 4080, A10, L40, 4090, 5090
|
|
||||||
self.chunked_prefill_size = 2048
|
|
||||||
elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
|
|
||||||
self.chunked_prefill_size = 8192
|
|
||||||
else: # B200, MI300
|
|
||||||
self.chunked_prefill_size = 16384
|
|
||||||
else:
|
|
||||||
self.chunked_prefill_size = 4096
|
|
||||||
|
|
||||||
# Set cuda graph max batch size and cuda graph batch sizes
|
|
||||||
if self.cuda_graph_max_bs is None:
|
|
||||||
if gpu_mem is not None:
|
|
||||||
if gpu_mem < 20 * 1024:
|
|
||||||
# T4, 4080
|
|
||||||
self.cuda_graph_max_bs = 8
|
|
||||||
elif gpu_mem < 50 * 1024:
|
|
||||||
# A10, L40, 4090, 5090
|
|
||||||
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
|
|
||||||
# However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
|
|
||||||
# from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
|
|
||||||
if self.tp_size < 4:
|
|
||||||
self.cuda_graph_max_bs = 16
|
|
||||||
else:
|
|
||||||
self.cuda_graph_max_bs = 80
|
|
||||||
elif gpu_mem < 90 * 1024:
|
|
||||||
# H100, A100
|
|
||||||
if self.tp_size < 4:
|
|
||||||
self.cuda_graph_max_bs = 256
|
|
||||||
else:
|
|
||||||
self.cuda_graph_max_bs = 512
|
|
||||||
else:
|
|
||||||
# H20, H200, B200, MI300
|
|
||||||
self.cuda_graph_max_bs = 512
|
|
||||||
else:
|
|
||||||
# Default fallback
|
|
||||||
self.cuda_graph_max_bs = 160
|
|
||||||
|
|
||||||
if self.cuda_graph_bs is None:
|
|
||||||
self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
|
|
||||||
|
|
||||||
def _generate_cuda_graph_batch_sizes(self):
|
def _generate_cuda_graph_batch_sizes(self):
|
||||||
"""
|
"""
|
||||||
Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
|
Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class TestLlama4LoRA(CustomTestCase):
|
|||||||
"--tp-size",
|
"--tp-size",
|
||||||
str(model.tp_size),
|
str(model.tp_size),
|
||||||
"--context-length",
|
"--context-length",
|
||||||
"1048576",
|
"262144",
|
||||||
"--attention-backend",
|
"--attention-backend",
|
||||||
"fa3",
|
"fa3",
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ class TestFile:
|
|||||||
|
|
||||||
suites = {
|
suites = {
|
||||||
"per-commit": [
|
"per-commit": [
|
||||||
|
TestFile("function_call/test_json_schema_constraint.py", 30),
|
||||||
TestFile("hicache/test_hicache.py", 116),
|
TestFile("hicache/test_hicache.py", 116),
|
||||||
TestFile("hicache/test_hicache_mla.py", 127),
|
TestFile("hicache/test_hicache_mla.py", 127),
|
||||||
TestFile("hicache/test_hicache_storage.py", 127),
|
TestFile("hicache/test_hicache_storage.py", 127),
|
||||||
@@ -20,11 +21,9 @@ suites = {
|
|||||||
TestFile("lora/test_lora_eviction.py", 200),
|
TestFile("lora/test_lora_eviction.py", 200),
|
||||||
TestFile("lora/test_lora_backend.py", 99),
|
TestFile("lora/test_lora_backend.py", 99),
|
||||||
TestFile("lora/test_multi_lora_backend.py", 60),
|
TestFile("lora/test_multi_lora_backend.py", 60),
|
||||||
TestFile("lora/test_lora_cuda_graph.py", 250),
|
|
||||||
TestFile("lora/test_lora_update.py", 400),
|
TestFile("lora/test_lora_update.py", 400),
|
||||||
TestFile("lora/test_lora_qwen3.py", 97),
|
TestFile("lora/test_lora_qwen3.py", 97),
|
||||||
TestFile("lora/test_lora_radix_cache.py", 100),
|
TestFile("lora/test_lora_radix_cache.py", 100),
|
||||||
TestFile("lora/test_chunked_sgmv_backend.py", 30),
|
|
||||||
TestFile("models/test_embedding_models.py", 73),
|
TestFile("models/test_embedding_models.py", 73),
|
||||||
# TestFile("models/test_clip_models.py", 52),
|
# TestFile("models/test_clip_models.py", 52),
|
||||||
TestFile("models/test_encoder_embedding_models.py", 100),
|
TestFile("models/test_encoder_embedding_models.py", 100),
|
||||||
@@ -51,7 +50,6 @@ suites = {
|
|||||||
TestFile("openai_server/features/test_reasoning_content.py", 89),
|
TestFile("openai_server/features/test_reasoning_content.py", 89),
|
||||||
TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
|
TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
|
||||||
TestFile("openai_server/function_call/test_tool_choice.py", 226),
|
TestFile("openai_server/function_call/test_tool_choice.py", 226),
|
||||||
TestFile("function_call/test_json_schema_constraint.py", 30),
|
|
||||||
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
|
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
|
||||||
TestFile("openai_server/validation/test_matched_stop.py", 60),
|
TestFile("openai_server/validation/test_matched_stop.py", 60),
|
||||||
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
|
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
|
||||||
@@ -144,8 +142,6 @@ suites = {
|
|||||||
TestFile("test_multi_instance_release_memory_occupation.py", 64),
|
TestFile("test_multi_instance_release_memory_occupation.py", 64),
|
||||||
],
|
],
|
||||||
"per-commit-8-gpu": [
|
"per-commit-8-gpu": [
|
||||||
# Disabled because it hangs on the CI.
|
|
||||||
# TestFile("ep/test_moe_ep.py", 181),
|
|
||||||
TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800),
|
TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800),
|
||||||
TestFile("lora/test_lora_llama4.py", 600),
|
TestFile("lora/test_lora_llama4.py", 600),
|
||||||
TestFile("test_disaggregation.py", 499),
|
TestFile("test_disaggregation.py", 499),
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ import unittest
|
|||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import torch
|
|
||||||
|
|
||||||
from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
|
from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
|
||||||
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
||||||
@@ -11,6 +10,7 @@ from sglang.test.test_utils import (
|
|||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
DEFAULT_URL_FOR_TEST,
|
DEFAULT_URL_FOR_TEST,
|
||||||
CustomTestCase,
|
CustomTestCase,
|
||||||
|
is_in_ci,
|
||||||
popen_launch_server,
|
popen_launch_server,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -50,6 +50,7 @@ class TestMLADeepseekV3(CustomTestCase):
|
|||||||
self.assertGreater(metrics["accuracy"], 0.62)
|
self.assertGreater(metrics["accuracy"], 0.62)
|
||||||
|
|
||||||
|
|
||||||
|
@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
|
||||||
class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
|
class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import unittest
|
import unittest
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process
|
||||||
@@ -21,7 +21,7 @@ from sglang.test.test_utils import (
|
|||||||
|
|
||||||
TEST_SUITE = dict(
|
TEST_SUITE = dict(
|
||||||
model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||||
mem_fraction_static=0.85,
|
mem_fraction_static=0.83,
|
||||||
dp_size=2,
|
dp_size=2,
|
||||||
tp_size=2,
|
tp_size=2,
|
||||||
)
|
)
|
||||||
@@ -214,6 +214,9 @@ def _run_sglang_subprocess(
|
|||||||
_mem_usage = get_gpu_memory_gb(rank)
|
_mem_usage = get_gpu_memory_gb(rank)
|
||||||
print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
|
print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
|
||||||
del hf_model
|
del hf_model
|
||||||
|
hf_model = None
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
time.sleep(5)
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
_curr_usage = get_gpu_memory_gb(rank)
|
_curr_usage = get_gpu_memory_gb(rank)
|
||||||
assert (
|
assert (
|
||||||
|
|||||||
Reference in New Issue
Block a user