Revert "fix some typos" (#6244)
This commit is contained in:
@@ -68,7 +68,7 @@ blackwell = [
|
||||
]
|
||||
|
||||
# HIP (Heterogeneous-computing Interface for Portability) for AMD
|
||||
# => base docker rocm/vllm-dev:20250114, not from public vLLM whl
|
||||
# => base docker rocm/vllm-dev:20250114, not from public vllm whl
|
||||
srt_hip = [
|
||||
"sglang[runtime_common]",
|
||||
"torch",
|
||||
@@ -76,7 +76,7 @@ srt_hip = [
|
||||
"outlines==0.1.11"
|
||||
]
|
||||
|
||||
# xpu is not enabled in public vLLM and torch whl,
|
||||
# xpu is not enabled in public vllm and torch whl,
|
||||
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
|
||||
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
|
||||
@@ -84,8 +84,8 @@ srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
|
||||
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
|
||||
# CPU: currently, there are no pre-built vLLM wheels for CPU.
|
||||
# To install vLLM for CPU, please follow the instruction here:
|
||||
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
||||
# To install vllm for CPU, please follow the instruction here:
|
||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
|
||||
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
|
||||
|
||||
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(
|
||||
|
||||
|
||||
def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
|
||||
# Disable CUDA graph and torch compile to save time
|
||||
# Disable cuda graph and torch compile to save time
|
||||
server_args.disable_cuda_graph = True
|
||||
server_args.enable_torch_compile = False
|
||||
print(f"Disable CUDA Graph and Torch Compile to save time...")
|
||||
|
||||
@@ -12,7 +12,7 @@ use_vllm_custom_allreduce = get_bool_env_var(
|
||||
)
|
||||
|
||||
if not is_hpu():
|
||||
# ROCm does not use vLLM custom allreduce
|
||||
# ROCm does not use vllm custom allreduce
|
||||
if use_vllm_custom_allreduce and not is_hip():
|
||||
try:
|
||||
import vllm._C
|
||||
|
||||
@@ -53,7 +53,7 @@ class ChatGLMConfig(PretrainedConfig):
|
||||
self.kv_channels = kv_channels
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.seq_length = seq_length
|
||||
# It is to be compatible with long LoRA.
|
||||
# It is to be compatible with long lora.
|
||||
self.max_position_embeddings = seq_length
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
@@ -29,7 +29,7 @@ class LoadFormat(str, enum.Enum):
|
||||
class LoadConfig:
|
||||
"""
|
||||
download_dir: Directory to download and load the weights, default to the
|
||||
default cache directory of HuggingFace.
|
||||
default cache directory of huggingface.
|
||||
load_format: The format of the model weights to load:
|
||||
"auto" will try to load the weights in the safetensors format and
|
||||
fall back to the pytorch bin format if safetensors format is
|
||||
|
||||
@@ -172,7 +172,7 @@ class CustomAllreduce:
|
||||
|
||||
if not custom_ar:
|
||||
# disable because of missing custom allreduce library
|
||||
# e.g. in a non-CUDA environment
|
||||
# e.g. in a non-cuda environment
|
||||
return
|
||||
|
||||
self.group = group
|
||||
@@ -389,11 +389,11 @@ class CustomAllreduce:
|
||||
if _is_hip:
|
||||
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
|
||||
handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
|
||||
logger.info("Registering %d CUDA graph addresses", len(offset))
|
||||
logger.info("Registering %d cuda graph addresses", len(offset))
|
||||
ops.register_graph_buffers(self._ptr, handles, offsets)
|
||||
else:
|
||||
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
|
||||
logger.info("Registering %d CUDA graph addresses", len(offset))
|
||||
logger.info("Registering %d cuda graph addresses", len(offset))
|
||||
# We cannot directly use `dist.all_gather_object` here
|
||||
# because it is incompatible with `gloo` backend under inference mode.
|
||||
# see https://github.com/pytorch/pytorch/issues/126032 for details.
|
||||
@@ -435,7 +435,7 @@ class CustomAllreduce:
|
||||
return False
|
||||
|
||||
# all reduce, assuming inp tensor is IPC registered with register_buffer,
|
||||
# or, in the context of CUDA graphs, register_graph_buffers
|
||||
# or, in the context of cuda graphs, register_graph_buffers
|
||||
def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
|
||||
if out is None:
|
||||
out = torch.empty_like(inp)
|
||||
@@ -473,7 +473,7 @@ class CustomAllreduce:
|
||||
return out
|
||||
|
||||
def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
"""The main allreduce API that provides support for CUDA graph."""
|
||||
"""The main allreduce API that provides support for cuda graph."""
|
||||
# When custom allreduce is disabled, this will be None.
|
||||
if self.disabled or not self.should_custom_ar(input):
|
||||
return None
|
||||
@@ -489,7 +489,7 @@ class CustomAllreduce:
|
||||
return torch.empty_like(input)
|
||||
else:
|
||||
if _is_hip:
|
||||
# note: outside of CUDA graph context,
|
||||
# note: outside of cuda graph context,
|
||||
# custom allreduce incurs a cost of cudaMemcpy, which should
|
||||
# be small (<=1% of overall latency) compared to the performance
|
||||
# gains of using custom kernels
|
||||
|
||||
@@ -121,14 +121,14 @@ def can_actually_p2p(
|
||||
Therefore, we have to perform a real P2P access to check if it is actually
|
||||
possible.
|
||||
|
||||
Note on p2p and CUDA IPC:
|
||||
Note on p2p and cuda IPC:
|
||||
Usually, one process uses one GPU:
|
||||
GPU src --> CUDA context src --> tensor src --> process src
|
||||
GPU src --> cuda context src --> tensor src --> process src
|
||||
|
||||
We need to combine p2p and CUDA IPC, so that:
|
||||
GPU src --> CUDA context src --> tensor src --> process src
|
||||
We need to combine p2p and cuda IPC, so that:
|
||||
GPU src --> cuda context src --> tensor src --> process src
|
||||
|shared|
|
||||
GPU tgt --> CUDA context tgt --> tensor tgt --> process tgt
|
||||
GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
|
||||
That is to say, process src creates a tensor in GPU src, passes IPC handle to
|
||||
process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
|
||||
tensor in process tgt will be reflected in the tensor in process src, because
|
||||
@@ -201,9 +201,9 @@ def can_actually_p2p(
|
||||
# then all the processes can read the cache file to check the p2p access status.
|
||||
# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
|
||||
# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
|
||||
# e.g. used by different vLLM engines. The device id in the cache file is a
|
||||
# e.g. used by different vllm engines. The device id in the cache file is a
|
||||
# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
|
||||
# of visible devices in the vLLM engine.
|
||||
# of visible devices in the vllm engine.
|
||||
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
|
||||
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ class PyNcclCommunicator:
|
||||
self.device = device
|
||||
# nccl communicator and stream will use this device
|
||||
# `torch.cuda.device` is a context manager that changes the
|
||||
# current CUDA device to the specified one
|
||||
# current cuda device to the specified one
|
||||
with torch.cuda.device(device):
|
||||
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
|
||||
self.world_size, self.unique_id, self.rank
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
|
||||
# often gets stuck when initializing the NCCL communicator.
|
||||
# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
|
||||
# contains many other potential CUDA APIs, that are not allowed during
|
||||
# contains many other potential cuda APIs, that are not allowed during
|
||||
# capturing the CUDA graph. For further details, please check
|
||||
# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
|
||||
#
|
||||
|
||||
@@ -170,7 +170,7 @@ class GroupCoordinator:
|
||||
GroupCoordinator takes charge of all the communication operations among
|
||||
the processes in the group. It can route the communication to
|
||||
a specific implementation (e.g. switch allreduce implementation
|
||||
based on the tensor size and CUDA graph mode).
|
||||
based on the tensor size and cuda graph mode).
|
||||
"""
|
||||
|
||||
# available attributes:
|
||||
|
||||
@@ -127,7 +127,7 @@ CONTEXT_LENGTH_KEYS = [
|
||||
|
||||
|
||||
def get_context_length(config):
|
||||
"""Get the context length of a model from a HuggingFace model config."""
|
||||
"""Get the context length of a model from a huggingface model config."""
|
||||
text_config = config
|
||||
rope_scaling = getattr(text_config, "rope_scaling", None)
|
||||
if rope_scaling:
|
||||
|
||||
@@ -20,7 +20,7 @@ class AttentionBackend(ABC):
|
||||
raise NotImplementedError()
|
||||
|
||||
def init_cuda_graph_state(self, max_bs: int):
|
||||
"""Init the global shared states for CUDA graph."""
|
||||
"""Init the global shared states for cuda graph."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def init_forward_metadata_capture_cuda_graph(
|
||||
@@ -33,7 +33,7 @@ class AttentionBackend(ABC):
|
||||
forward_mode: ForwardMode,
|
||||
spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
|
||||
):
|
||||
"""Init the metadata for a forward pass for capturing a CUDA graph."""
|
||||
"""Init the metadata for a forward pass for capturing a cuda graph."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def init_forward_metadata_replay_cuda_graph(
|
||||
@@ -47,7 +47,7 @@ class AttentionBackend(ABC):
|
||||
spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
|
||||
seq_lens_cpu: Optional[torch.Tensor],
|
||||
):
|
||||
"""Init the metadata for a forward pass for replaying a CUDA graph."""
|
||||
"""Init the metadata for a forward pass for replaying a cuda graph."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_cuda_graph_seq_len_fill_value(self):
|
||||
|
||||
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
|
||||
|
||||
class DoubleSparseAttnBackend(AttentionBackend):
|
||||
def __init__(self, model_runner: ModelRunner):
|
||||
# Lazy import to avoid the initialization of CUDA context
|
||||
# Lazy import to avoid the initialization of cuda context
|
||||
from sglang.srt.layers.attention.triton_ops.double_sparsity_attention import (
|
||||
extend_attention_fwd,
|
||||
flash_decode_attention_fwd,
|
||||
|
||||
@@ -664,7 +664,7 @@ class FlashInferIndicesUpdaterDecode:
|
||||
kv_indptr = kv_indptr[: bs + 1]
|
||||
|
||||
if wrapper.is_cuda_graph_enabled:
|
||||
# Directly write to the CUDA graph input buffer
|
||||
# Directly write to the cuda graph input buffer
|
||||
kv_indices = wrapper._paged_kv_indices_buf
|
||||
else:
|
||||
kv_indices = torch.empty(
|
||||
@@ -1173,7 +1173,7 @@ def fast_decode_plan(
|
||||
"""
|
||||
A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend.
|
||||
Modifications:
|
||||
- Remove unnecessary device-to-device copy for the CUDA graph buffers.
|
||||
- Remove unnecessary device-to-device copy for the cuda graph buffers.
|
||||
- Remove unnecessary host-to-device copy for the metadata buffers.
|
||||
"""
|
||||
batch_size = len(last_page_len)
|
||||
|
||||
@@ -874,7 +874,7 @@ def fast_mla_decode_plan(
|
||||
) -> None:
|
||||
"""A faster version of BatchMLAPagedAttentionWrapper::plan,
|
||||
for skipping the stream synchronization in original plan function during
|
||||
CUDA graph replaying.
|
||||
cuda graph replaying.
|
||||
"""
|
||||
self._causal = causal
|
||||
self._page_size = page_size
|
||||
|
||||
@@ -92,7 +92,7 @@ class TritonAttnBackend(AttentionBackend):
|
||||
skip_prefill: bool = False,
|
||||
kv_indptr_buf: Optional[torch.Tensor] = None,
|
||||
):
|
||||
# Lazy import to avoid the initialization of CUDA context
|
||||
# Lazy import to avoid the initialization of cuda context
|
||||
from sglang.srt.layers.attention.triton_ops.decode_attention import (
|
||||
decode_attention_fwd,
|
||||
)
|
||||
|
||||
@@ -257,7 +257,7 @@ class VisionFlash3Attention(nn.Module):
|
||||
**kwargs,
|
||||
):
|
||||
if not _is_cuda:
|
||||
raise Exception("VisionFlash3Attention is only available for CUDA")
|
||||
raise Exception("VisionFlash3Attention is only available for cuda")
|
||||
super().__init__()
|
||||
|
||||
def forward(
|
||||
|
||||
@@ -237,7 +237,7 @@ def dp_scatter(
|
||||
forward_batch: ForwardBatch,
|
||||
):
|
||||
# local_num_tokens is not necessarily the same as local_tokens.shape[0],
|
||||
# since local_tokens may be padded for CUDA graph
|
||||
# since local_tokens may be padded for cuda graph
|
||||
local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
|
||||
|
||||
local_tokens.fill_(0)
|
||||
|
||||
@@ -166,7 +166,7 @@ class LogitsMetadata:
|
||||
|
||||
def compute_dp_attention_metadata(self, hidden_states: torch.Tensor):
|
||||
if self.global_num_tokens_for_logprob_cpu is None:
|
||||
# we are capturing CUDA graph
|
||||
# we are capturing cuda graph
|
||||
return
|
||||
|
||||
cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
|
||||
|
||||
@@ -38,7 +38,7 @@ try:
|
||||
except ImportError:
|
||||
VLLM_AVAILABLE = False
|
||||
|
||||
# Define empty classes as placeholders when vLLM is not available
|
||||
# Define empty classes as placeholders when vllm is not available
|
||||
class DummyConfig:
|
||||
def override_quantization_method(self, *args, **kwargs):
|
||||
return None
|
||||
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
||||
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
||||
raise ValueError(
|
||||
f"{quantization} quantization requires some operators from vllm. "
|
||||
"Please install vLLM by `pip install vllm==0.8.4`"
|
||||
"Please install vllm by `pip install vllm==0.8.4`"
|
||||
)
|
||||
|
||||
return QUANTIZATION_METHODS[quantization]
|
||||
@@ -231,7 +231,7 @@ original_isinstance = builtins.isinstance
|
||||
def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
|
||||
"""
|
||||
Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig
|
||||
can recognize SGLang layers
|
||||
can recognize sglang layers
|
||||
"""
|
||||
if not VLLM_AVAILABLE:
|
||||
return
|
||||
@@ -267,7 +267,7 @@ def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
|
||||
def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
|
||||
"""
|
||||
Monkey patch the apply function of vllm's FusedMoEMethodBase.
|
||||
Convert SGLang arguments to vLLM arguments.
|
||||
Convert sglang arguments to vllm arguments.
|
||||
"""
|
||||
original_apply = class_obj.apply
|
||||
sig = inspect.signature(original_apply)
|
||||
@@ -329,6 +329,6 @@ def monkey_patch_quant_configs():
|
||||
monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)
|
||||
|
||||
|
||||
# Only apply monkey patches if vLLM is available
|
||||
# Only apply monkey patches if vllm is available
|
||||
if VLLM_AVAILABLE:
|
||||
monkey_patch_quant_configs()
|
||||
|
||||
@@ -208,7 +208,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
|
||||
|
||||
def process_weights_after_loading(self, layer: Module) -> None:
|
||||
# Block quant doesn't need to process weights after loading
|
||||
# Use torch Parameter to avoid CUDA graph capturing issue
|
||||
# Use torch Parameter to avoid cuda graph capturing issue
|
||||
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
|
||||
layer.weight_scale_inv = torch.nn.Parameter(
|
||||
layer.weight_scale_inv.data, requires_grad=False
|
||||
|
||||
@@ -363,7 +363,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
if self._is_wNa16_group_channel(weight_quant, input_quant):
|
||||
if not VLLM_AVAILABLE:
|
||||
raise ImportError(
|
||||
"vLLM is not installed, to use CompressedTensorsW4A16Sparse24 and CompressedTensorsWNA16, please install vLLM"
|
||||
"vllm is not installed, to use CompressedTensorsW4A16Sparse24 and CompressedTensorsWNA16, please install vllm"
|
||||
)
|
||||
if (
|
||||
self.quant_format == CompressionFormat.marlin_24.value
|
||||
@@ -409,7 +409,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
if self._is_fp8_w8a16(weight_quant, input_quant):
|
||||
if not VLLM_AVAILABLE:
|
||||
raise ImportError(
|
||||
"vLLM is not installed, to use CompressedTensorsW8A16Fp8, please install vLLM"
|
||||
"vllm is not installed, to use CompressedTensorsW8A16Fp8, please install vllm"
|
||||
)
|
||||
is_static_input_scheme = input_quant and not input_quant.dynamic
|
||||
return CompressedTensorsW8A16Fp8(
|
||||
@@ -491,7 +491,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
):
|
||||
if not VLLM_AVAILABLE:
|
||||
raise ImportError(
|
||||
"vLLM is not installed, to use CompressedTensors24, please install vLLM"
|
||||
"vllm is not installed, to use CompressedTensors24, please install vllm"
|
||||
)
|
||||
# Have a valid sparsity scheme
|
||||
# Validate layer is supported by Cutlass 2:4 Kernel
|
||||
|
||||
@@ -65,7 +65,7 @@ class CompressedTensorsMoEMethod:
|
||||
if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
|
||||
if not VLLM_AVAILABLE:
|
||||
raise ImportError(
|
||||
"vLLM is not installed, to use CompressedTensorsWNA16MoEMethod, please install vLLM."
|
||||
"vllm is not installed, to use CompressedTensorsWNA16MoEMethod, please install vllm."
|
||||
)
|
||||
return CompressedTensorsWNA16MoEMethod(quant_config)
|
||||
elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
|
||||
|
||||
@@ -27,10 +27,10 @@ except ImportError:
|
||||
MARLIN_FP8_AVAILABLE = False
|
||||
|
||||
def apply_fp8_marlin_linear(*args, **kwargs):
|
||||
raise ImportError("vLLM is not installed")
|
||||
raise ImportError("vllm is not installed")
|
||||
|
||||
def prepare_fp8_layer_for_marlin(*args, **kwargs):
|
||||
raise ImportError("vLLM is not installed")
|
||||
raise ImportError("vllm is not installed")
|
||||
|
||||
|
||||
__all__ = ["CompressedTensorsW8A16Fp8"]
|
||||
@@ -45,7 +45,7 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
|
||||
|
||||
if not MARLIN_FP8_AVAILABLE:
|
||||
raise ImportError(
|
||||
"vLLM is not installed. To use CompressedTensorsW8A16Fp8, please install vLLM"
|
||||
"vllm is not installed. To use CompressedTensorsW8A16Fp8, please install vllm"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -357,7 +357,7 @@ def apply_fp8_linear(
|
||||
|
||||
# Fused GEMM_DQ
|
||||
if VLLM_AVAILABLE and use_vllm_cutlass_w8a8_fp8_kernel:
|
||||
# Fall back to vLLM cutlass w8a8 fp8 kernel
|
||||
# Fall back to vllm cutlass w8a8 fp8 kernel
|
||||
output = ops.cutlass_scaled_mm(
|
||||
qinput,
|
||||
weight,
|
||||
@@ -493,7 +493,7 @@ def apply_fp8_linear(
|
||||
if cutlass_fp8_supported:
|
||||
try:
|
||||
if VLLM_AVAILABLE and use_vllm_cutlass_w8a8_fp8_kernel:
|
||||
# Fall back to vLLM cutlass w8a8 fp8 kernel
|
||||
# Fall back to vllm cutlass w8a8 fp8 kernel
|
||||
output = ops.cutlass_scaled_mm(
|
||||
qinput,
|
||||
weight,
|
||||
|
||||
@@ -186,8 +186,8 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding):
|
||||
|
||||
It supports multiple scaling factors. Since multiple LoRA adapters may have
|
||||
different scaling factors, we need multiple cos/sin caches. In this way,
|
||||
instead of running rotary embedding kernel per LoRA adapter, we can run multiple
|
||||
LoRA adapters in a batched way.
|
||||
instead of running rotary embedding kernel per lora, we can run multiple
|
||||
lora in a batched way.
|
||||
|
||||
In addition to that, we also keep the cos/sin cache for the scaling factor
|
||||
of 1 (default) at all times.
|
||||
|
||||
@@ -41,13 +41,13 @@ class BaseLoRABackend:
|
||||
def run_lora_a_sgemm(
|
||||
self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
|
||||
) -> torch.Tensor:
|
||||
"""Run segment Gemm of LoRA a modules with current backend.
|
||||
"""Run segment Gemm of lora a modules with current backend.
|
||||
The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html.
|
||||
|
||||
Args:
|
||||
x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
|
||||
weights: a set of LoRA weights with shape (num_lora, c * r, input_dim),
|
||||
here r is LoRA rank, c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj)
|
||||
weights: a set of lora weights with shape (num_lora, c * r, input_dim),
|
||||
here r is lora rank, c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj)
|
||||
usually input_dim is much larger than r
|
||||
Returns:
|
||||
result with shape (s, c * r)
|
||||
@@ -57,12 +57,12 @@ class BaseLoRABackend:
|
||||
def run_lora_b_sgemm(
|
||||
self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
|
||||
) -> torch.Tensor:
|
||||
"""Run segment Gemm of LoRA b modules with current backend.
|
||||
"""Run segment Gemm of lora b modules with current backend.
|
||||
The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html.
|
||||
|
||||
Args:
|
||||
x: input matrix with shape (s, r), here s is the sum of all sequence lengths, r is LoRA rank
|
||||
weights: a set of LoRA weights with shape (num_lora, output_dim, r)
|
||||
x: input matrix with shape (s, r), here s is the sum of all sequence lengths, r is lora rank
|
||||
weights: a set of lora weights with shape (num_lora, output_dim, r)
|
||||
usually output_dim is much larger than r
|
||||
Returns:
|
||||
result with shape (s, output_dim)
|
||||
@@ -77,7 +77,7 @@ class BaseLoRABackend:
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
"""Run the LoRA pass for QKV Layer.
|
||||
"""Run the lora pass for QKV Layer.
|
||||
|
||||
Args:
|
||||
x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
|
||||
@@ -100,7 +100,7 @@ class BaseLoRABackend:
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
"""Run the LoRA pass for gate_up_proj, usually attached to MergedColumnParallelLayer.
|
||||
"""Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer.
|
||||
|
||||
Args:
|
||||
x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
|
||||
|
||||
@@ -117,7 +117,7 @@ class FlashInferLoRABackend(BaseLoRABackend):
|
||||
dtype=x.dtype,
|
||||
)
|
||||
|
||||
# Compute LoRA for gate and up proj respectively
|
||||
# Compute lora for gate and up proj respectively
|
||||
lora_output[:, :output_dim] = self.run_lora_b_sgemm(
|
||||
x=lora_a_output[:, :lora_rank].contiguous(),
|
||||
weights=gate_up_lora_b[0],
|
||||
|
||||
@@ -198,7 +198,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
|
||||
if self.lora_backend.fuse_stacked_lora_b:
|
||||
assert (
|
||||
B_buffer_q.shape[-1] == B_buffer_kv.shape[-1]
|
||||
), "The LoRA rank of q and kv should be the same when enabling fusion of qkv lora_b"
|
||||
), "The lora rank of q and kv should be the same when enabling fusion of qkv lora_b"
|
||||
output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
|
||||
|
||||
# B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
|
||||
|
||||
@@ -40,7 +40,7 @@ class LoRALayer(nn.Module):
|
||||
self.config: LoRAConfig = config
|
||||
self.base_hf_config: AutoConfig = base_hf_config
|
||||
|
||||
# LoRA weights in cpu. The weights are loaded from checkpoint.
|
||||
# lora weights in cpu. The weights are loaded from checkpoint.
|
||||
self.weights: Dict[str, torch.Tensor] = {}
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ class LoRAAdapter(nn.Module):
|
||||
|
||||
def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
|
||||
|
||||
# Collect target q/k/v modules. This process is necessary since there might be no LoRA attached to k_proj
|
||||
# Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
|
||||
target_module = set()
|
||||
for weight_name in weight_names:
|
||||
if "k_proj" in weight_name:
|
||||
@@ -110,7 +110,7 @@ class LoRAAdapter(nn.Module):
|
||||
return
|
||||
|
||||
for weight_name in weight_names:
|
||||
# We assume every LoRA adaptor should contain LoRA modules for q_proj
|
||||
# We assume every lora adaptor should contain lora modules for q_proj
|
||||
if "q_proj" in weight_name:
|
||||
q_name = weight_name
|
||||
k_name = weight_name.replace("q_proj", "k_proj")
|
||||
@@ -118,7 +118,7 @@ class LoRAAdapter(nn.Module):
|
||||
kv_name = weight_name.replace("q_proj", "kv_proj")
|
||||
qkv_name = weight_name.replace("q_proj", "qkv_proj")
|
||||
|
||||
# If k_proj doesn't have LoRA, initialize it to zero
|
||||
# If k_proj doesn't have lora, initialize it to zero
|
||||
k_proj_weight = (
|
||||
weights[k_name]
|
||||
if "k_proj" in target_module
|
||||
|
||||
@@ -93,14 +93,14 @@ class LoRAManager:
|
||||
# Config of each LoRA adapter
|
||||
self.configs: Dict[str, LoRAConfig] = {}
|
||||
|
||||
# Target module names in HuggingFace LoRA configs.
|
||||
# Target module names in huggingface lora configs.
|
||||
# e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
|
||||
self.hf_target_names: Set[str] = set()
|
||||
for name, path in self.lora_paths.items():
|
||||
self.configs[name] = LoRAConfig(path)
|
||||
self.hf_target_names.update(self.configs[name].target_modules)
|
||||
|
||||
# Target LoRA weight names for lora_a and lora_b modules respectively.
|
||||
# Target lora weight names for lora_a and lora_b modules respectively.
|
||||
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
|
||||
self.lora_weight_names: Set[Tuple[str]] = set(
|
||||
[get_stacked_name(module) for module in self.hf_target_names]
|
||||
@@ -119,11 +119,11 @@ class LoRAManager:
|
||||
lora_adapter.initialize_weights()
|
||||
self.loras[name] = lora_adapter
|
||||
|
||||
# misc LoRA configs
|
||||
# misc lora configs
|
||||
self.max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
|
||||
|
||||
if self.lora_backend == "flashinfer":
|
||||
# FIXME: remove the restrictions after supporting multi-rank for flashinfer backend
|
||||
# FIXME remove the restrictions after supporting multi-rank for flashinfer backend
|
||||
max_lora_dim = max([x.hf_config["r"] for x in self.configs.values()])
|
||||
scaling = list(self.loras.values())[0].scaling
|
||||
assert all(x.hf_config["r"] == max_lora_dim for x in self.configs.values())
|
||||
@@ -144,16 +144,16 @@ class LoRAManager:
|
||||
self.lora_modules,
|
||||
)
|
||||
|
||||
# Initialize target LoRA modules in memory pool
|
||||
# Initialize target lora modules in memory pool
|
||||
self.memory_pool.init_buffers(self.lora_weight_names, self.base_model)
|
||||
|
||||
def prepare_lora_batch(self, forward_batch: ForwardBatch):
|
||||
# load active LoRAs into LoRA memory pool
|
||||
# load active loras into lora memory pool
|
||||
cur_uids = set(forward_batch.lora_paths)
|
||||
assert len(cur_uids) <= self.max_loras_per_batch
|
||||
self.memory_pool.prepare_lora_batch(cur_uids, self.loras)
|
||||
|
||||
# set up batch info shared by all LoRA modules
|
||||
# set up batch info shared by all lora modules
|
||||
bs = forward_batch.batch_size
|
||||
|
||||
if (
|
||||
@@ -221,7 +221,7 @@ class LoRAManager:
|
||||
)
|
||||
self.lora_backend.set_batch_info(batch_info)
|
||||
|
||||
# call set_lora_info for each LoRA module
|
||||
# call set_lora_info for each lora module
|
||||
for layer_id, modules in self.lora_modules.items():
|
||||
for module_name, module in modules:
|
||||
if "qkv_proj" in module_name:
|
||||
|
||||
@@ -16,7 +16,7 @@ from sglang.srt.lora.utils import (
|
||||
|
||||
|
||||
class LoRAMemoryPool:
|
||||
"""Class for memory pool management of LoRA modules"""
|
||||
"""Class for memory pool management of lora modules"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -38,7 +38,7 @@ class LoRAMemoryPool:
|
||||
self.tp_rank: int = tp_rank
|
||||
self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = lora_modules
|
||||
|
||||
# Both A_buffer and B_buffer map LoRA weight names to their buffer space.
|
||||
# Both A_buffer and B_buffer map lora weight names to their buffer space.
|
||||
# A_buffer contains num_layer number of row-major tensors with shape
|
||||
# (max_loras_per_batch, stacked_num * max_lora_dim, input_dim)
|
||||
# B_buffer contains num_layer number of column-major tensors with shape
|
||||
@@ -46,10 +46,10 @@ class LoRAMemoryPool:
|
||||
self.A_buffer: Dict[str, List[torch.Tensor]] = {}
|
||||
self.B_buffer: Dict[str, List[torch.Tensor]] = {}
|
||||
|
||||
# LoRA uid -> buffer idx in memory pool
|
||||
# Lora uid -> buffer idx in memory pool
|
||||
self.uid_to_buffer_id: Dict[Optional[str], int] = {}
|
||||
|
||||
# Buffer idx -> LoRA uid in memory pool
|
||||
# Buffer idx -> lora uid in memory pool
|
||||
# All uids are initialized as empty strings for empty buffer slots
|
||||
# Here we don't initialize to None since None is a valid uid
|
||||
self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
|
||||
@@ -95,7 +95,7 @@ class LoRAMemoryPool:
|
||||
base_model: torch.nn.Module,
|
||||
):
|
||||
|
||||
# lora_weight_names is a set of name pairs indicating each pair of LoRA modules to load
|
||||
# lora_weight_names is a set of name pairs indicating each pair of lora modules to load
|
||||
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
|
||||
self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
|
||||
device = next(base_model.parameters()).device
|
||||
@@ -137,7 +137,7 @@ class LoRAMemoryPool:
|
||||
return buffer_id, ""
|
||||
|
||||
for buffer_id in range(self.max_loras_per_batch):
|
||||
# Evict unneeded LoRA
|
||||
# Evict unneeded lora
|
||||
if self.buffer_id_to_uid[buffer_id] not in cur_uids:
|
||||
return buffer_id, self.buffer_id_to_uid[buffer_id]
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ def _gate_up_lora_b_kernel(
|
||||
):
|
||||
# This kernel packs 2 sgemms (gate/up) into a single kernel.
|
||||
|
||||
# x: (s, 2 * K), s is the sum of sequence lengths, K equals the LoRA rank
|
||||
# x: (s, 2 * K), s is the sum of sequence lengths, K equals the lora rank
|
||||
# weights: (num_lora, 2 * output_dim, K)
|
||||
# output: (s, 2 * output_dim)
|
||||
# output_dim >> K
|
||||
|
||||
@@ -39,7 +39,7 @@ def _qkv_lora_b_kernel(
|
||||
):
|
||||
# This kernel packs 3 sgemms (q/k/v) into a single kernel.
|
||||
|
||||
# x: (s, 3 * K), s is the sum of sequence lengths, K equals the LoRA rank
|
||||
# x: (s, 3 * K), s is the sum of sequence lengths, K equals the lora rank
|
||||
# weights: (num_lora, N_Q + 2 * N_KV, K)
|
||||
# output: (s, N_Q + 2 * N_KV)
|
||||
# N_Q >> K, N_KV >> K
|
||||
|
||||
@@ -22,13 +22,13 @@ class LoRABatchInfo:
|
||||
# Maximum sequence length of current batch
|
||||
max_len: int
|
||||
|
||||
# The index of LoRA adapter used by each sequence, in shape (bs,)
|
||||
# The index of lora adapter used by each sequence, in shape (bs,)
|
||||
weight_indices: torch.Tensor
|
||||
|
||||
# ranks of each LoRA adapter, in shape (lora_num,)
|
||||
# ranks of each lora adapter, in shape (lora_num,)
|
||||
lora_ranks: torch.Tensor
|
||||
|
||||
# scaling of each LoRA adapter, in shape (lora_num,)
|
||||
# scaling of each lora adapter, in shape (lora_num,)
|
||||
scalings: torch.Tensor
|
||||
|
||||
|
||||
@@ -51,9 +51,9 @@ def get_customized_names_from_hf_names(
|
||||
hf_module_names: Set[str], base_model: torch.nn.Module
|
||||
) -> Set[str]:
|
||||
"""
|
||||
This function takes in a set of HuggingFace style module names:
|
||||
This function takes in a set of huggingface style module names:
|
||||
e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
|
||||
and outputs a set of module names of customized SGLang layers:
|
||||
and outputs a set of module names of customized sglang layers:
|
||||
e.g., {"qkv_proj", "o_proj"}
|
||||
"""
|
||||
if hasattr(base_model, "get_module_name"):
|
||||
@@ -87,7 +87,7 @@ def get_hidden_dim(
|
||||
else:
|
||||
"""
|
||||
WARNING: get_hidden_dim() is not defined,
|
||||
which is used to get the hidden dim for different LoRA modules
|
||||
which is used to get the hidden dim for different lora modules
|
||||
Use the default one, but please check if it is correct for your model.
|
||||
Please implement the function in the model class if it is not.
|
||||
You can reference this function in llama.py.
|
||||
@@ -108,7 +108,7 @@ def get_hidden_dim(
|
||||
|
||||
def get_stacked_name(name: str) -> Tuple[str]:
|
||||
"""
|
||||
Mapping a target LoRA module name to (stacked name for LoRA A, stacked name for LoRA B)
|
||||
Mapping a target module name to (stacked name for Lora A, stacked name for Lora B)
|
||||
"""
|
||||
params_mapping = {
|
||||
"q_proj": ("qkv_proj", "q_proj"),
|
||||
@@ -122,7 +122,7 @@ def get_stacked_name(name: str) -> Tuple[str]:
|
||||
|
||||
def get_stacked_multiply(module_name: str) -> int:
|
||||
"""
|
||||
Mapping a module name to its magnification at output dimension
|
||||
Mapping a lora module name to its magnification at output dimension
|
||||
"""
|
||||
stacked_rank = {
|
||||
"qkv_proj": 3,
|
||||
@@ -137,7 +137,7 @@ def get_weight_name(
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
target_name is name of a given module,
|
||||
lora_weight_names is a set of LoRA stacked name pairs (see get_stacked_name method above)
|
||||
lora_weight_names is a set of lora stacked name pairs (see get_stacked_name method above)
|
||||
If there is a weight name in lora_weight_names that can match target_name, return this name
|
||||
Else raise ValueError.
|
||||
"""
|
||||
|
||||
@@ -1667,7 +1667,7 @@ class Scheduler(
|
||||
can_cuda_graph = 0
|
||||
|
||||
if not spec_algorithm.is_none():
|
||||
# TODO(sang): Support CUDA graph when idle batch is there.
|
||||
# TODO(sang): Support cuda graph when idle batch is there.
|
||||
if local_batch is None or local_batch.forward_mode.is_idle():
|
||||
can_cuda_graph = 0
|
||||
|
||||
@@ -1704,7 +1704,7 @@ class Scheduler(
|
||||
local_batch.global_num_tokens = global_num_tokens
|
||||
local_batch.global_num_tokens_for_logprob = global_num_tokens_for_logprob
|
||||
|
||||
# Check forward mode for CUDA graph
|
||||
# Check forward mode for cuda graph
|
||||
if not disable_cuda_graph:
|
||||
local_batch.can_run_dp_cuda_graph = can_cuda_graph
|
||||
|
||||
|
||||
@@ -238,7 +238,7 @@ class TokenizerManager:
|
||||
self.metrics_collector = TokenizerMetricsCollector(
|
||||
labels={
|
||||
"model_name": self.server_args.served_model_name,
|
||||
# TODO: Add LoRA name/path in the future,
|
||||
# TODO: Add lora name/path in the future,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@@ -213,7 +213,7 @@ class TpModelWorkerClient:
|
||||
penalizer_orchestrator=None,
|
||||
)
|
||||
|
||||
# A CUDA stream sync here to avoid the CUDA illegal memory access error.
|
||||
# A cuda stream sync here to avoid the cuda illegal memory access error.
|
||||
self.scheduler_stream.synchronize()
|
||||
|
||||
# Push a new batch to the queue
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Run the model with CUDA graph and torch.compile."""
|
||||
"""Run the model with cuda graph and torch.compile."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -127,7 +127,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
|
||||
else:
|
||||
capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
|
||||
else:
|
||||
# Since speculative decoding requires more CUDA graph memory, we
|
||||
# Since speculative decoding requires more cuda graph memory, we
|
||||
# capture less.
|
||||
capture_bs = (
|
||||
list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
|
||||
@@ -161,7 +161,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
|
||||
return capture_bs, compile_bs
|
||||
|
||||
|
||||
# Reuse this memory pool across all CUDA graph runners.
|
||||
# Reuse this memory pool across all cuda graph runners.
|
||||
global_graph_memory_pool = None
|
||||
|
||||
|
||||
@@ -175,7 +175,7 @@ def set_global_graph_memory_pool(val):
|
||||
|
||||
|
||||
class CudaGraphRunner:
|
||||
"""A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
|
||||
"""A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""
|
||||
|
||||
def __init__(self, model_runner: ModelRunner):
|
||||
# Parse args
|
||||
@@ -194,7 +194,7 @@ class CudaGraphRunner:
|
||||
|
||||
# Batch sizes to capture
|
||||
self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
|
||||
rank0_log(f"Capture CUDA graph bs {self.capture_bs}")
|
||||
rank0_log(f"Capture cuda graph bs {self.capture_bs}")
|
||||
self.capture_forward_mode = ForwardMode.DECODE
|
||||
self.capture_hidden_mode = CaptureHiddenMode.NULL
|
||||
self.num_tokens_per_bs = 1
|
||||
@@ -334,8 +334,8 @@ class CudaGraphRunner:
|
||||
else forward_batch.batch_size <= self.max_bs
|
||||
)
|
||||
|
||||
# NOTE: CUDA graph cannot handle mixed batch (encoder_len = 0)
|
||||
# If mixed batch cannot be supported, then encoder_lens can be removed in CUDA graph
|
||||
# NOTE: cuda graph cannot handle mixed batch (encoder_len = 0)
|
||||
# If mixed batch cannot be supported, then encoder_lens can be removed in cuda graph
|
||||
# because the full_text_row_masked_out_mask tensor will always be ones
|
||||
is_encoder_lens_supported = (
|
||||
torch.all(forward_batch.encoder_lens > 0)
|
||||
@@ -350,7 +350,7 @@ class CudaGraphRunner:
|
||||
avail_mem = get_available_gpu_memory(
|
||||
self.model_runner.device, self.model_runner.gpu_id, empty_cache=False
|
||||
)
|
||||
# Reverse the order to enable better memory sharing across CUDA graphs.
|
||||
# Reverse the order to enable better memory sharing across cuda graphs.
|
||||
capture_range = (
|
||||
tqdm.tqdm(list(reversed(self.capture_bs)))
|
||||
if get_tensor_model_parallel_rank() == 0
|
||||
@@ -429,9 +429,9 @@ class CudaGraphRunner:
|
||||
spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
|
||||
)
|
||||
if self.model_runner.server_args.lora_paths is not None:
|
||||
# Currently, if the lora_path in `lora_paths` is None, the LoRA backend will use a
|
||||
# different logic to handle LoRA, so we need to set `lora_paths` to a list of non-None
|
||||
# values if LoRA is enabled.
|
||||
# Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
|
||||
# different logic to handle lora, so we need to set `lora_paths` to a list of non-None
|
||||
# values if lora is enabled.
|
||||
lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
|
||||
else:
|
||||
lora_paths = None
|
||||
|
||||
@@ -229,7 +229,7 @@ class ForwardBatch:
|
||||
# For DP attention
|
||||
global_num_tokens_cpu: Optional[List[int]] = None
|
||||
global_num_tokens_gpu: Optional[torch.Tensor] = None
|
||||
# Has to be None when CUDA graph is captured.
|
||||
# Has to be None when cuda graph is captured.
|
||||
global_num_tokens_for_logprob_cpu: Optional[List[int]] = None
|
||||
global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
|
||||
# for extend, local start pos and num tokens is different in logits processor
|
||||
@@ -356,7 +356,7 @@ class ForwardBatch:
|
||||
if model_runner.model_is_mrope:
|
||||
ret._compute_mrope_positions(model_runner, batch)
|
||||
|
||||
# Init LoRA information
|
||||
# Init lora information
|
||||
if model_runner.server_args.lora_paths is not None:
|
||||
model_runner.lora_manager.prepare_lora_batch(ret)
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ class ModelRunner:
|
||||
if self.tp_size > 1 and supports_torch_tp:
|
||||
self.apply_torch_tp()
|
||||
|
||||
# Init LoRA
|
||||
# Init lora
|
||||
if server_args.lora_paths is not None:
|
||||
self.init_lora_manager()
|
||||
|
||||
@@ -1009,11 +1009,11 @@ class ModelRunner:
|
||||
)
|
||||
|
||||
def init_cuda_graphs(self):
|
||||
"""Capture CUDA graphs."""
|
||||
"""Capture cuda graphs."""
|
||||
self.cuda_graph_runner = None
|
||||
|
||||
if not self.is_generation:
|
||||
# TODO: Currently, CUDA graph only captures decode steps, which only exists for generation models
|
||||
# TODO: Currently, cuda graph only captures decode steps, which only exists for generation models
|
||||
return
|
||||
|
||||
if self.server_args.disable_cuda_graph:
|
||||
@@ -1022,12 +1022,12 @@ class ModelRunner:
|
||||
tic = time.time()
|
||||
before_mem = get_available_gpu_memory(self.device, self.gpu_id)
|
||||
logger.info(
|
||||
f"Capture CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
|
||||
f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
|
||||
)
|
||||
self.cuda_graph_runner = CudaGraphRunner(self)
|
||||
after_mem = get_available_gpu_memory(self.device, self.gpu_id)
|
||||
logger.info(
|
||||
f"Capture CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. "
|
||||
f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. "
|
||||
f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
|
||||
)
|
||||
|
||||
|
||||
@@ -393,7 +393,7 @@ class CohereForCausalLM(nn.Module):
|
||||
weight_loader(param, loaded_weight, shard_id)
|
||||
break
|
||||
else:
|
||||
# lm_head is not used in vLLM as it is tied with embed_token.
|
||||
# lm_head is not used in vllm as it is tied with embed_token.
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if "lm_head.weight" in name:
|
||||
continue
|
||||
|
||||
@@ -1190,7 +1190,7 @@ class CLIPVisionTower(nn.Module):
|
||||
# vision_tower = create_sam_vit(**vision_tower_params)
|
||||
forward_kwargs = dict()
|
||||
|
||||
else: # HuggingFace
|
||||
else: # huggingface
|
||||
from transformers import CLIPVisionModel
|
||||
|
||||
vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params)
|
||||
|
||||
@@ -342,7 +342,7 @@ class GemmaForCausalLM(nn.Module):
|
||||
weight_loader(param, loaded_weight, shard_id)
|
||||
break
|
||||
else:
|
||||
# lm_head is not used in vLLM as it is tied with embed_token.
|
||||
# lm_head is not used in vllm as it is tied with embed_token.
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if "lm_head.weight" in name:
|
||||
continue
|
||||
|
||||
@@ -441,7 +441,7 @@ class Gemma2ForCausalLM(nn.Module):
|
||||
weight_loader(param, loaded_weight, shard_id)
|
||||
break
|
||||
else:
|
||||
# lm_head is not used in vLLM as it is tied with embed_token.
|
||||
# lm_head is not used in vllm as it is tied with embed_token.
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if "lm_head.weight" in name:
|
||||
continue
|
||||
|
||||
@@ -174,7 +174,7 @@ class Gemma3Attention(nn.Module):
|
||||
# Local attention. Override the values in config.json.
|
||||
self.rope_theta = config.rope_local_base_freq
|
||||
self.rope_scaling = {"rope_type": "default"}
|
||||
# FIXME(mick): idk why vLLM does this
|
||||
# FIXME(mick): idk why vllm does this
|
||||
# self.sliding_window = config.interleaved_sliding_window
|
||||
self.sliding_window = get_attention_sliding_window_size(config)
|
||||
else:
|
||||
@@ -667,7 +667,7 @@ class Gemma3ForCausalLM(PreTrainedModel):
|
||||
weight_loader(param, loaded_weight, shard_id)
|
||||
break
|
||||
else:
|
||||
# lm_head is not used in vLLM as it is tied with embed_token.
|
||||
# lm_head is not used in vllm as it is tied with embed_token.
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if "lm_head.weight" in name:
|
||||
continue
|
||||
|
||||
@@ -418,7 +418,7 @@ class MoonVitEncoderLayer(nn.Module):
|
||||
hidden_dim: int,
|
||||
mlp_dim: int,
|
||||
*,
|
||||
attn_implementation: str = "flash_attention_2", # use fa2 in SGLang by default
|
||||
attn_implementation: str = "flash_attention_2", # use fa2 in sglang by default
|
||||
activation=F.gelu,
|
||||
attn_bias: bool = False,
|
||||
):
|
||||
|
||||
@@ -537,8 +537,8 @@ class MllamaTextCrossAttention(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=add_prefix("o_proj", prefix),
|
||||
)
|
||||
# vllm.model_executor.layers.layernorm.RMSNorm has a precision issue,
|
||||
# use HuggingFace's instead
|
||||
# vllm.model_executor.layers.layernorm.RMSNorm has precision issue,
|
||||
# use huggingface's instead
|
||||
self.q_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
|
||||
self.k_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@@ -979,8 +979,8 @@ class MllamaForConditionalGeneration(nn.Module):
|
||||
cross_attention_states = None
|
||||
|
||||
if self.capture_mode:
|
||||
# NOTE: when doing CUDA graph capture, we do not want to skip cross attention
|
||||
# Make is a constant value to avoid CUDA graph capture issue
|
||||
# NOTE: when doing cuda graph capture, we do not want to skip cross attention
|
||||
# Make is a constant value to avoid cuda graph capture issue
|
||||
skip_cross_attention = False
|
||||
else:
|
||||
# NOTE: we do not need image_inputs when prefill
|
||||
|
||||
@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
|
||||
input_shape = input_ids.size()
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
|
||||
# Adapted from vLLM: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
|
||||
# Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
|
||||
|
||||
pos_list = []
|
||||
token_list = []
|
||||
|
||||
@@ -67,7 +67,7 @@ class Platform:
|
||||
# Real device name of current platform.
|
||||
device_name: str
|
||||
|
||||
# For specifying torch device for CUDA alike platform's capability.
|
||||
# For specifying torch device for cuda alike platform's capability.
|
||||
device_type: str
|
||||
|
||||
# The torch.distributed backend on current platform
|
||||
@@ -254,7 +254,7 @@ class Platform:
|
||||
@classmethod
|
||||
def check_and_update_lora_backend(cls, backend: str) -> str:
|
||||
"""
|
||||
Check and update the LoRA backend for the current platform.
|
||||
Check and update the lora backend for the current platform.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -246,7 +246,7 @@ class ServerArgs:
|
||||
self.mem_fraction_static = min(
|
||||
mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
|
||||
(gpu_mem - 1024 * 18)
|
||||
/ gpu_mem, # 15 GB + additional 3GB for CUDA graph
|
||||
/ gpu_mem, # 15 GB + additional 3GB for cuda graph
|
||||
)
|
||||
|
||||
# Set chunked prefill size, which depends on the gpu memory capacity
|
||||
@@ -276,9 +276,9 @@ class ServerArgs:
|
||||
)
|
||||
self.page_size = 128
|
||||
|
||||
# Set CUDA graph max batch size
|
||||
# Set cuda graph max batch size
|
||||
if self.cuda_graph_max_bs is None:
|
||||
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable CUDA graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating CUDA graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable CUDA graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating CUDA graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating CUDA graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
|
||||
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
|
||||
if gpu_mem is not None and gpu_mem < 25_000:
|
||||
if self.tp_size < 4:
|
||||
self.cuda_graph_max_bs = 8
|
||||
@@ -729,7 +729,7 @@ class ServerArgs:
|
||||
"--download-dir",
|
||||
type=str,
|
||||
default=ServerArgs.download_dir,
|
||||
help="Model download directory for HuggingFace.",
|
||||
help="Model download directory for huggingface.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-gpu-id",
|
||||
@@ -1024,12 +1024,12 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--disable-cuda-graph",
|
||||
action="store_true",
|
||||
help="Disable CUDA graph.",
|
||||
help="Disable cuda graph.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-cuda-graph-padding",
|
||||
action="store_true",
|
||||
help="Disable CUDA graph when padding is needed. Still uses CUDA graph when padding is not needed.",
|
||||
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-nccl-nvls",
|
||||
@@ -1075,7 +1075,7 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--enable-ep-moe",
|
||||
action="store_true",
|
||||
help="Enabling expert parallelism for MoE. The ep size is equal to the tp size.",
|
||||
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-torch-compile",
|
||||
@@ -1092,13 +1092,13 @@ class ServerArgs:
|
||||
"--cuda-graph-max-bs",
|
||||
type=int,
|
||||
default=ServerArgs.cuda_graph_max_bs,
|
||||
help="Set the maximum batch size for CUDA graph. It will extend the CUDA graph capture batch size to this value.",
|
||||
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cuda-graph-bs",
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="Set the list of batch sizes for CUDA graph.",
|
||||
help="Set the list of batch sizes for cuda graph.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--torchao-config",
|
||||
@@ -1334,7 +1334,7 @@ class ServerArgs:
|
||||
self.max_loras_per_batch > 0
|
||||
# FIXME
|
||||
and (self.lora_paths is None or self.disable_radix_cache)
|
||||
), "compatibility of LoRA and CUDA graph and RadixAttention is in progress"
|
||||
), "compatibility of lora and cuda graph and radix attention is in progress"
|
||||
assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
|
||||
assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
# Override context length with target model's context length
|
||||
server_args.context_length = target_worker.model_runner.model_config.context_len
|
||||
|
||||
# Do not capture CUDA graph in `super().__init__()`
|
||||
# Do not capture cuda graph in `super().__init__()`
|
||||
# It will be captured later.
|
||||
backup_disable_cuda_graph = server_args.disable_cuda_graph
|
||||
server_args.disable_cuda_graph = True
|
||||
@@ -136,7 +136,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
# Share the embedding and lm_head
|
||||
self.draft_model_runner.model.set_embed_and_head(embed, head)
|
||||
|
||||
# Init attention backend and CUDA graphs
|
||||
# Init attention backend and cuda graphs
|
||||
self.draft_model_runner.server_args.disable_cuda_graph = (
|
||||
backup_disable_cuda_graph
|
||||
)
|
||||
@@ -148,7 +148,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.init_cuda_graphs()
|
||||
|
||||
def init_attention_backend(self):
|
||||
# Create multi-step attn backends and CUDA graph runners
|
||||
# Create multi-step attn backends and cuda graph runners
|
||||
if self.server_args.attention_backend == "flashinfer":
|
||||
if not global_server_args_dict["use_mla_backend"]:
|
||||
from sglang.srt.layers.attention.flashinfer_backend import (
|
||||
@@ -207,7 +207,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
|
||||
|
||||
def init_cuda_graphs(self):
|
||||
"""Capture CUDA graphs."""
|
||||
"""Capture cuda graphs."""
|
||||
self.cuda_graph_runner = None
|
||||
self.cuda_graph_runner_for_draft_extend = None
|
||||
|
||||
@@ -218,12 +218,12 @@ class EAGLEWorker(TpModelWorker):
|
||||
tic = time.time()
|
||||
before_mem = get_available_gpu_memory(self.device, self.gpu_id)
|
||||
logger.info(
|
||||
f"Capture draft CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
|
||||
f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
|
||||
)
|
||||
self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
|
||||
after_mem = get_available_gpu_memory(self.device, self.gpu_id)
|
||||
logger.info(
|
||||
f"Capture draft CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
|
||||
f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
|
||||
)
|
||||
|
||||
# Capture extend
|
||||
|
||||
@@ -1117,7 +1117,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
|
||||
|
||||
def set_prometheus_multiproc_dir():
|
||||
# Set prometheus multiprocess directory
|
||||
# SGLang uses prometheus multiprocess mode
|
||||
# sglang uses prometheus multiprocess mode
|
||||
# we need to set this before importing prometheus_client
|
||||
# https://prometheus.github.io/client_python/multiprocess/
|
||||
global prometheus_multiproc_dir
|
||||
|
||||
@@ -42,7 +42,7 @@ class MockModelRunner:
|
||||
"TokenPool",
|
||||
(),
|
||||
{
|
||||
# A typical max_bs * max_context_len for CUDA graph decode
|
||||
# A typical max_bs * max_context_len for cuda graph decode
|
||||
"size": max_batch_size,
|
||||
# Add req_to_token attribute
|
||||
"req_to_token": torch.zeros(
|
||||
|
||||
@@ -37,7 +37,7 @@ class MockModelRunner:
|
||||
"TokenPool",
|
||||
(),
|
||||
{
|
||||
# A typical max_bs * max_context_len for CUDA graph decode
|
||||
# A typical max_bs * max_context_len for cuda graph decode
|
||||
"size": batch_size,
|
||||
# Add req_to_token attribute
|
||||
"req_to_token": torch.zeros(
|
||||
|
||||
Reference in New Issue
Block a user