Fix bugs in sampler with CUDA graph / torch.compile (#1306)

2024-09-02 16:18:48 -07:00
parent 2561ed012c
commit a5a134f39f
4 changed files with 48 additions and 26 deletions
--- a/python/sglang/srt/sampling/sampling_batch_info.py
+++ b/python/sglang/srt/sampling/sampling_batch_info.py
@@ -34,12 +34,14 @@ class SamplingBatchInfo:
    linear_penalties: torch.Tensor = None
    scaling_penalties: torch.Tensor = None

-    def has_bias(self):
+    def can_run_in_cuda_graph(self):
+        # Vocab bias and min_ps are not supported in CUDA graph
        return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
+            self.logit_bias is None
+            and self.vocab_mask is None
+            and self.linear_penalties is None
+            and self.scaling_penalties is None
+            and not self.need_min_p_sampling
        )

    @classmethod
@@ -48,35 +50,29 @@ class SamplingBatchInfo:
        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
        return ret

    def __getitem__(self, key):
        if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
+            # NOTE:This method is only used in CUDA graph
+            assert self.can_run_in_cuda_graph()
            return SamplingBatchInfo(
                vocab_size=self.vocab_size,
                temperatures=self.temperatures[key],
                top_ps=self.top_ps[key],
                top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
            )
        else:
            raise NotImplementedError

    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
+        # NOTE:This method is only used in CUDA graph
+        assert self.can_run_in_cuda_graph()

        self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
        self.temperatures[:bs] = other.temperatures
        self.top_ps[:bs] = other.top_ps
        self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps

    @classmethod
    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):