[gpt-oss] Add gpt-oss bf16 support

2025-08-13 21:25:57 +08:00
parent 5d2e7edf78
commit 17ea2ec6aa
1232 changed files with 777 additions and 36 deletions
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -0,0 +1,318 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+import vllm.envs
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.utils import LazyLoader
+from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
+                                                     StructuredOutputGrammar,
+                                                     StructuredOutputOptions)
+from vllm.v1.structured_output.utils import (choice_as_grammar,
+                                             convert_lark_to_ebnf,
+                                             grammar_is_likely_lark)
+
+if TYPE_CHECKING:
+    import xgrammar as xgr
+else:
+    xgr = LazyLoader("xgr", globals(), "xgrammar")
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class XgrammarBackend(StructuredOutputBackend):
+
+    def __post_init__(self):
+        self.disable_any_whitespace = \
+            self.vllm_config.decoding_config.disable_any_whitespace
+
+        if isinstance(self.tokenizer, MistralTokenizer):
+            # NOTE: ideally, xgrammar should handle this accordingly.
+            # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
+            try:
+                if self.tokenizer.is_tekken:
+                    encoded_vocab = self.tokenizer._vocab
+                else:
+                    encoded_vocab = [
+                        token for token, _ in sorted(
+                            self.tokenizer.get_vocab().items(),
+                            key=lambda x: x[1],
+                        )
+                    ]
+                stop_token_ids = None
+                if (hasattr(
+                        self.tokenizer,
+                        "eos_token_id",
+                ) and self.tokenizer.eos_token_id is not None):
+                    stop_token_ids = [self.tokenizer.eos_token_id]
+            except AttributeError as e:
+                raise ValueError(
+                    f"Cannot get the vocabulary of the tokenizer "
+                    f"{type(self.tokenizer)}. The tokenizer should have a "
+                    "get_vocab method.") from e
+            tokenizer_info = xgr.TokenizerInfo(  # type: ignore
+                encoded_vocab=encoded_vocab,
+                # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
+                vocab_type=xgr.VocabType.RAW
+                if self.tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK,
+                vocab_size=self.vocab_size,
+                stop_token_ids=stop_token_ids,
+                add_prefix_space=True,
+            )
+        else:
+            tokenizer_info = xgr.TokenizerInfo.from_huggingface(
+                self.tokenizer,
+                vocab_size=self.vocab_size,
+            )
+        self.compiler = xgr.GrammarCompiler(
+            tokenizer_info,
+            max_threads=8,
+            cache_enabled=True,
+            cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
+        )
+
+        self.num_speculative_tokens = 0
+        if self.vllm_config.speculative_config is not None:
+            self.num_speculative_tokens = \
+                self.vllm_config.speculative_config.num_speculative_tokens
+
+    def compile_grammar(self, request_type: StructuredOutputOptions,
+                        grammar_spec: str) -> StructuredOutputGrammar:
+        if request_type == StructuredOutputOptions.JSON:
+            ctx = self.compiler.compile_json_schema(
+                grammar_spec, any_whitespace=not self.disable_any_whitespace)
+        elif request_type == StructuredOutputOptions.JSON_OBJECT:
+            ctx = self.compiler.compile_json_schema(
+                '{"type": "object"}',
+                any_whitespace=not self.disable_any_whitespace)
+        elif request_type == StructuredOutputOptions.GRAMMAR:
+            ctx = self.compiler.compile_grammar(grammar_spec)
+        elif request_type == StructuredOutputOptions.REGEX:
+            ctx = self.compiler.compile_regex(grammar_spec)
+        elif request_type == StructuredOutputOptions.STRUCTURAL_TAG:
+            s_tag = json.loads(grammar_spec)
+            tags = [
+                xgr.StructuralTagItem(
+                    begin=s["begin"],
+                    schema=json.dumps(s["schema"]),
+                    end=s["end"],
+                ) for s in s_tag["structures"]
+            ]
+            ctx = self.compiler.compile_structural_tag(tags, s_tag["triggers"])
+        else:
+            logger.error(
+                "Validation should have already occurred. Please file an issue."
+            )
+            raise ValueError(
+                f"grammar is not of valid supported types. ({request_type!s})")
+
+        return XgrammarGrammar(
+            matcher=xgr.GrammarMatcher(
+                ctx,
+                max_rollback_tokens=self.num_speculative_tokens,
+            ),
+            vocab_size=self.vocab_size,
+            ctx=ctx,
+        )
+
+    def allocate_token_bitmask(self, max_num_seqs: int):
+        return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size)
+
+    def destroy(self):
+        del self.compiler
+
+
+@dataclass
+class XgrammarGrammar(StructuredOutputGrammar):
+    # NOTE: This would be a generic-enough class for
+    # supporting different backends, in the future.
+    # For now, just xgrammar.
+    #
+    # https://xgrammar.mlc.ai/docs/api/python/index.html#xgrammar.GrammarMatcher.find_jump_forward_string
+    # for jump-forward decoding
+
+    vocab_size: int
+    matcher: xgr.GrammarMatcher = field(hash=False)
+    ctx: xgr.CompiledGrammar = field(hash=False)
+    num_processed_tokens: int = field(default_factory=lambda: 0,
+                                      repr=False,
+                                      hash=False,
+                                      init=False)
+
+    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
+        """Accepts a list of tokens and advances the FSM.
+
+        Returns True if the FSM was advanced successfully.
+        Returns False if the FSM failed to advance.
+        """
+        for token in tokens:
+            if not self.matcher.accept_token(token):
+                logger.error(
+                    "Failed to advance FSM for request %s "
+                    "for tokens %s. Please file an issue.", request_id, token)
+                return False
+            self.num_processed_tokens += 1
+        return True
+
+    def validate_tokens(self, tokens: list[int]) -> list[int]:
+        """Checks if the list of tokens are accepted by the FSM in sequence.
+        Will not advance the FSM.
+
+        Returns the prefix list of tokens that are accepted by the FSM.
+        """
+        accepted_tokens = []
+        for token in tokens:
+            if self.matcher.accept_token(token):
+                accepted_tokens.append(token)
+            else:
+                break
+        if len(accepted_tokens) > 0:
+            # Rollback the FSM to the initial state
+            self.matcher.rollback(len(accepted_tokens))
+        return accepted_tokens
+
+    def rollback(self, num_tokens: int) -> None:
+        self.matcher.rollback(num_tokens)
+        self.num_processed_tokens -= num_tokens
+
+    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
+        self.matcher.fill_next_token_bitmask(bitmask, idx)
+
+    def is_terminated(self) -> bool:
+        return self.matcher.is_terminated()
+
+    def reset(self):
+        self.num_processed_tokens = 0
+        self.matcher.reset()
+
+
+def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
+    """Check if JSON schema contains features unsupported by xgrammar."""
+
+    def check_object(obj: dict[str, Any]) -> bool:
+        if not isinstance(obj, dict):
+            return False
+
+        # Check for numeric ranges
+        if obj.get("type") in ("integer", "number") and ("multipleOf" in obj):
+            return True
+
+        # Check for array unsupported keywords
+        if obj.get("type") == "array" and any(
+                key in obj for key in ("uniqueItems", "contains",
+                                       "minContains", "maxContains")):
+            return True
+
+        # Unsupported keywords for strings
+        if obj.get("type") == "string" and "format" in obj:
+            return True
+
+        # Unsupported keywords for objects
+        if obj.get("type") == "object" and any(
+                key in obj for key in ("minProperties", "maxProperties",
+                                       "propertyNames", "patternProperties")):
+            return True
+
+        # Recursively check all nested objects and arrays
+        for value in obj.values():
+            if isinstance(value, dict):
+                if check_object(value):
+                    return True
+            elif isinstance(value, list):
+                for item in value:
+                    if isinstance(item, dict) and check_object(item):
+                        return True
+
+        return False
+
+    return check_object(schema)
+
+
+def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
+    """Validate that the request is supported by structured output.
+
+    Raises ValueError if the request is not supported.
+    """
+    if sampling_params.guided_decoding is None:
+        return
+
+    gd_params = sampling_params.guided_decoding
+
+    if gd_params.regex:
+        try:
+            xgr.Grammar.from_regex(gd_params.regex)
+        except Exception as err:
+            raise ValueError("Failed to transform regex into a grammar: "
+                             f"{err}") from err
+
+    if gd_params.choice:
+        choice_grammar = choice_as_grammar(gd_params.choice)
+        try:
+            xgr.Grammar.from_ebnf(choice_grammar)
+        except Exception as err:
+            raise ValueError("Failed to transform choices into a grammar: "
+                             "{err}") from err
+        gd_params.choice = None
+        gd_params.grammar = choice_grammar
+        return
+
+    if gd_params.json:
+        if isinstance(gd_params.json, str):
+            try:
+                schema = json.loads(gd_params.json)
+            except json.JSONDecodeError as e:
+                raise ValueError("Invalid JSON grammar specification.") from e
+        else:
+            schema = gd_params.json
+
+        try:
+            xgr.Grammar.from_json_schema(schema)
+        except Exception as err:
+            raise ValueError("Failed to transform json schema into a grammar: "
+                             f"{err}") from err
+
+        if has_xgrammar_unsupported_json_features(schema):
+            raise ValueError("The provided JSON schema contains features not "
+                             "supported by xgrammar.")
+        return
+
+    if gd_params.grammar:
+        if grammar_is_likely_lark(gd_params.grammar):
+            # xgrammar supports EBNF grammars only
+            try:
+                gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar)
+            except ValueError as e:
+                raise ValueError(
+                    "Failed to convert the grammar from Lark to EBNF. ") from e
+
+        # Test parsing EBNF grammar, possibly already converted from Lark
+        try:
+            # parse the grammar, but we aren't compiling it.
+            xgr.Grammar.from_ebnf(gd_params.grammar)
+        except Exception as e:
+            raise ValueError("Invalid grammar specification.") from e
+        return
+
+    if gd_params.structural_tag:
+        try:
+            s_tag = json.loads(gd_params.structural_tag)
+            tags = [
+                xgr.StructuralTagItem(
+                    begin=s["begin"],
+                    schema=json.dumps(s["schema"]),
+                    end=s["end"],
+                ) for s in s_tag["structures"]
+            ]
+            xgr.Grammar.from_structural_tag(tags, s_tag["triggers"])
+        except Exception as e:
+            raise ValueError("Invalid structural tag specification.") from e