upgrade xgrammar to 0.1.19 (#6129)

2025-05-08 17:42:02 -04:00
parent f6f96b0521
commit 911f3ba6f4
2 changed files with 12 additions and 20 deletions
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -42,7 +42,7 @@ runtime_common = [
    "transformers==4.51.1",
    "uvicorn",
    "uvloop",
-    "xgrammar==0.1.17",
+    "xgrammar==0.1.19",
    "blobfile==3.0.0"
 ]
--- a/python/sglang/srt/constrained/xgrammar_backend.py
+++ b/python/sglang/srt/constrained/xgrammar_backend.py
@@ -18,6 +18,7 @@ import logging
 from typing import List, Optional, Tuple, Union
 import torch
+import xgrammar
 from xgrammar import (
    CompiledGrammar,
    GrammarCompiler,
@@ -58,17 +59,11 @@ class XGrammarGrammar(BaseGrammarObject):
        self.override_stop_tokens = override_stop_tokens
        self.finished = False
-        # Fix (from vLLM team): postpone the import of apply_token_bitmask_inplace_kernels to the
-        # class init site to avoid re-initializing CUDA in forked subprocess.
-        from xgrammar.kernels import apply_token_bitmask_inplace_kernels
+        from xgrammar.kernels.apply_token_bitmask_inplace_cpu import (
+            apply_token_bitmask_inplace_cpu,
+        )
-        self.use_token_bitmask_triton = get_bool_env_var(
-            "SGLANG_TOKEN_BITMASK_TRITON", "false"
-        )
-        self.apply_vocab_mask_cuda = apply_token_bitmask_inplace_kernels.get(
-            "cuda", None
-        )
-        self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_kernels.get("cpu", None)
+        self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_cpu
    def accept_token(self, token: int):
        assert self.matcher.accept_token(token)
@@ -113,15 +108,12 @@ class XGrammarGrammar(BaseGrammarObject):
        return vocab_mask.to(device, non_blocking=True)
    def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        if (
-            not self.use_token_bitmask_triton
-            and logits.device.type == "cuda"
-            and self.apply_vocab_mask_cuda
-        ):
-            return self.apply_vocab_mask_cuda(logits, vocab_mask)
-        if logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
-            return self.apply_vocab_mask_cpu(logits, vocab_mask)
-        apply_token_bitmask_inplace_triton(logits, vocab_mask)
+        if logits.device.type == "cuda":
+            apply_token_bitmask_inplace_triton(logits, vocab_mask)
+        elif logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
+            self.apply_vocab_mask_cpu(logits, vocab_mask)
+        else:
+            raise RuntimeError(f"Unsupported device: {logits.device.type}")
    def copy(self):
        matcher = GrammarMatcher(