From b5044fbf12d1764444ed8105c4520b3a24db0cea Mon Sep 17 00:00:00 2001
From: Yuan Luo
Date: Fri, 10 Oct 2025 12:03:17 +0800
Subject: [PATCH] Replace pad with cat for better performance (#11388)

Co-authored-by: luoyuan.luo
---
 python/sglang/srt/models/dots_vlm_vit.py | 2 +-
 python/sglang/srt/models/glm4v.py        | 2 +-
 python/sglang/srt/models/qwen2_5_vl.py   | 2 +-
 python/sglang/srt/models/qwen2_vl.py     | 2 +-
 python/sglang/srt/models/qwen3_vl.py     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/models/dots_vlm_vit.py b/python/sglang/srt/models/dots_vlm_vit.py
index e36e01ee3..b89cb6562 100644
--- a/python/sglang/srt/models/dots_vlm_vit.py
+++ b/python/sglang/srt/models/dots_vlm_vit.py
@@ -323,7 +323,7 @@ class DotsVisionTransformer(PreTrainedModel):
             dim=0,
             dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
         )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])

         for blk in self.blocks:
             hidden_states = blk(
diff --git a/python/sglang/srt/models/glm4v.py b/python/sglang/srt/models/glm4v.py
index 0aab90a6a..953a86c73 100644
--- a/python/sglang/srt/models/glm4v.py
+++ b/python/sglang/srt/models/glm4v.py
@@ -434,7 +434,7 @@ class Glm4vVisionModel(nn.Module):
         cu_seqlens = torch.repeat_interleave(
             grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()

         x = self.embeddings(
diff --git a/python/sglang/srt/models/qwen2_5_vl.py b/python/sglang/srt/models/qwen2_5_vl.py
index 73d08d42d..e49ba7f1f 100644
--- a/python/sglang/srt/models/qwen2_5_vl.py
+++ b/python/sglang/srt/models/qwen2_5_vl.py
@@ -436,7 +436,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
                 .to(device=x.device, dtype=torch.int32),
             ]
         )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])

         # transformers
         x = x.unsqueeze(1)
diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py
index a44e94a07..7a42829e8 100644
--- a/python/sglang/srt/models/qwen2_vl.py
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -407,7 +407,7 @@ class Qwen2VisionTransformer(nn.Module):
         cu_seqlens = torch.repeat_interleave(
             grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])

         # transformers
         x = x.unsqueeze(1)
diff --git a/python/sglang/srt/models/qwen3_vl.py b/python/sglang/srt/models/qwen3_vl.py
index d37d06e73..8649807a5 100644
--- a/python/sglang/srt/models/qwen3_vl.py
+++ b/python/sglang/srt/models/qwen3_vl.py
@@ -458,7 +458,7 @@ class Qwen3_VisionTransformer(nn.Module):
                 (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2]).cumsum(dim=0),
             ]
         )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
         # max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
         x = x.unsqueeze(1)