Sync from v0.13
This commit is contained in:
353
vllm/v1/structured_output/__init__.py
Normal file
353
vllm/v1/structured_output/__init__.py
Normal file
@@ -0,0 +1,353 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import multiprocessing
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputBackend,
|
||||
StructuredOutputGrammar,
|
||||
)
|
||||
from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.v1.request import Request
|
||||
else:
|
||||
torch = LazyLoader("torch", globals(), "torch")
|
||||
|
||||
ReasoningParser = object
|
||||
Request = object
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class StructuredOutputManager:
    """Engine-level manager for structured output requests.

    Lazily creates the (single) structured-output backend, compiles grammars
    for incoming requests -- on a thread pool unless running under the
    external_launcher executor -- and assembles the per-step token bitmask
    that the GPU runner applies during sampling.
    """

    def __init__(self, vllm_config: VllmConfig):
        # Created lazily in grammar_init() when the first structured-output
        # request arrives; a single backend serves all requests.
        self.backend: StructuredOutputBackend | None = None
        # Set only when a reasoning parser is configured; used to defer FSM
        # advancement until the model leaves its reasoning section.
        self.reasoner: ReasoningParser | None = None
        self.vllm_config = vllm_config

        # When in external_launcher mode, async grammar compilation causes deadlocks
        # due to external_launcher mode having a scheduler for each TP rank.
        # Async grammar compilation causes the WAITING_FOR_FSM → WAITING transition to
        # happen at different times on different TP ranks,
        # breaking the determinism assumption that external_launcher relies on.
        self._use_async_grammar_compilation = (
            vllm_config.parallel_config.distributed_executor_backend
            != "external_launcher"
        )

        # Allocated lazily in grammar_bitmask() via the backend.
        self._grammar_bitmask: torch.Tensor | None = None
        # -1 has all bits set: the "every token allowed" row value.
        self._full_mask = torch.tensor(-1, dtype=torch.int32)

        max_batch_size = self.vllm_config.scheduler_config.max_num_seqs
        # Above this many structured requests per step, bitmask rows are
        # filled on a thread pool instead of serially.
        self.fill_bitmask_parallel_threshold = 128
        if self.fill_bitmask_parallel_threshold < max_batch_size:
            # Number of (grammar, row) entries handed to each worker task.
            self.fill_bitmask_parallel_batch_size = 16
            # Use:
            # - at least 1 CPU
            # - at most half the number of CPUs or 8, whichever is less
            max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
            self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)

        if not self.vllm_config.model_config.skip_tokenizer_init:
            # The default max_workers if not specified is the number of
            # CPUs * 5, which is way too high since these tasks are CPU-bound,
            # not I/O bound. We also know we would never dominate CPU usage
            # with just grammar compilation, so we set it to half the number
            # of CPUs.
            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
            self.executor = ThreadPoolExecutor(max_workers=max_workers)
            self.tokenizer = cached_tokenizer_from_config(
                model_config=self.vllm_config.model_config
            )
            reasoning_parser = (
                self.vllm_config.structured_outputs_config.reasoning_parser
            )
            reasoning_parser_plugin = (
                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
            )
            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)

            # Re-read after the optional plugin import so any parser
            # registration side effects are in place before resolving it.
            reasoning_parser = (
                self.vllm_config.structured_outputs_config.reasoning_parser
            )
            if reasoning_parser:
                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
                    reasoning_parser
                )
                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)

        self.enable_in_reasoning = (
            self.vllm_config.structured_outputs_config.enable_in_reasoning
        )

    def grammar_init(self, request: Request) -> None:
        """Create (or schedule async creation of) the grammar for ``request``.

        Instantiates the backend on first use, chosen by the request's
        ``_backend`` field. When async compilation is enabled the grammar
        attribute holds a Future until it resolves.
        """
        if request.structured_output_request is None:
            return

        if TYPE_CHECKING:
            assert (
                request.sampling_params is not None
                and request.sampling_params.structured_outputs is not None
            )

        # Initialize the backend the first time it is needed.
        #
        # NOTE: We only support a single backend. We do NOT support different
        # backends on a per-request basis in V1 (for now, anyway...).
        # _backend is set in Processor._validate_structured_output
        if self.backend is None:
            assert request.sampling_params is not None
            backend = request.sampling_params.structured_outputs._backend
            vocab_size = self.vllm_config.model_config.get_vocab_size()
            if backend == "xgrammar":
                self.backend = XgrammarBackend(
                    self.vllm_config,
                    tokenizer=self.tokenizer,
                    vocab_size=vocab_size,
                )
            elif backend == "guidance":
                self.backend = GuidanceBackend(
                    self.vllm_config,
                    tokenizer=self.tokenizer,
                    vocab_size=vocab_size,
                )
            elif backend == "outlines":
                # Imported lazily so the dependency is only required when
                # this backend is actually selected.
                from vllm.v1.structured_output.backend_outlines import OutlinesBackend

                self.backend = OutlinesBackend(
                    self.vllm_config,
                    tokenizer=self.tokenizer,
                    vocab_size=vocab_size,
                )
            elif backend == "lm-format-enforcer":
                from vllm.v1.structured_output.backend_lm_format_enforcer import (  # noqa: E501
                    LMFormatEnforcerBackend,
                )

                self.backend = LMFormatEnforcerBackend(
                    self.vllm_config,
                    tokenizer=self.tokenizer,
                    vocab_size=vocab_size,
                )
            else:
                raise ValueError(f"Unsupported structured output backend: {backend}")

        if self._use_async_grammar_compilation:
            # Compilation can be slow; run it off the scheduler thread.
            grammar = self.executor.submit(self._create_grammar, request)
        else:
            grammar = self._create_grammar(request)  # type: ignore[assignment]
        request.structured_output_request.grammar = grammar  # type: ignore[assignment]

    def _create_grammar(
        self,
        request: Request,
    ) -> StructuredOutputGrammar:
        """Compile the grammar for ``request`` on the calling thread."""
        key = request.structured_output_request.structured_output_key  # type: ignore[union-attr]

        # Note that the request was validated in the engine core client,
        # so at this point we know it is a supported type of request.
        #
        # TODO: we still need to handle xgrammar compilation failures,
        # though it should be unlikely as we test that up front as well.
        request_type, grammar_spec = key

        assert self.backend is not None
        return self.backend.compile_grammar(request_type, grammar_spec)

    def _fill_bitmasks(
        self,
        batch: list[tuple[StructuredOutputGrammar, int, bool]],
    ) -> None:
        """Fill one bitmask row per (grammar, row index, apply?) entry.

        Rows whose grammar is terminated, or that should not be constrained
        this step, are reset to the all-ones "everything allowed" mask.
        """
        assert self._grammar_bitmask is not None
        for grammar, index, apply_bitmask in batch:
            if apply_bitmask and not grammar.is_terminated():
                grammar.fill_bitmask(self._grammar_bitmask, index)
            else:
                # Note that for thinking support, we will need to
                # reset the relevant part of the bitmask for consequent
                # requests here.
                self._grammar_bitmask[index].fill_(self._full_mask)

    def _async_submit_fill_bitmask(
        self,
        batch: list[tuple[StructuredOutputGrammar, int, bool]],
    ) -> Future:
        """Queue one _fill_bitmasks() batch on the fill-mask thread pool."""
        return self.executor_for_fillmask.submit(self._fill_bitmasks, batch)

    def grammar_bitmask(
        self,
        requests: dict[str, Request],
        structured_output_request_ids: list[str],
        scheduled_spec_decode_tokens: dict[str, list[int]],
    ) -> "npt.NDArray[np.int32] | None":
        """Build the token bitmask for this step's structured requests.

        Returns an int32 numpy array with one row per checked token position
        (1 + num spec tokens per request), or None when no structured-output
        requests are scheduled.
        """
        # Prepare the structured output bitmask for this batch.
        if not structured_output_request_ids:
            return None

        max_num_spec_tokens = 0
        if self.vllm_config.speculative_config is not None:
            max_num_spec_tokens = (
                self.vllm_config.speculative_config.num_speculative_tokens
            )

        if self._grammar_bitmask is None:
            assert self.backend is not None
            max_batch_size = self.vllm_config.scheduler_config.max_num_seqs

            # Allocate a bitmask for each token needing to be checked:
            # one for each speculative position, and one more for the
            # bonus token / non-speculative token.
            self._grammar_bitmask = self.backend.allocate_token_bitmask(
                max_batch_size * (1 + max_num_spec_tokens)
            )

        # Generate a batched bitmask for all structured output requests.
        # When speculative decoding is enabled, we need to include multiple
        # masks for each request, one for each possible bonus token position.
        # These are stored inline in the tensor and unpacked by the gpu runner.
        cumulative_index = 0

        # Optimized parallel filling of bitmasks for
        # non-spec, large-batch-size cases
        if (
            len(structured_output_request_ids) > self.fill_bitmask_parallel_threshold
            and max_num_spec_tokens == 0
        ):
            promises = []
            batch = []
            for req_id in structured_output_request_ids:
                request = requests[req_id]
                structured_output_request = request.structured_output_request
                if TYPE_CHECKING:
                    assert structured_output_request is not None
                    assert structured_output_request.grammar is not None

                apply_bitmask = self.should_fill_bitmask(request)
                batch.append(
                    (structured_output_request.grammar, cumulative_index, apply_bitmask)
                )
                if len(batch) == self.fill_bitmask_parallel_batch_size:
                    promises.append(self._async_submit_fill_bitmask(batch))
                    batch = []

                cumulative_index += 1
            # Flush the final partial batch.
            if batch:
                promises.append(self._async_submit_fill_bitmask(batch))

            # Wait for all bitmask filling tasks to complete.
            for promise in promises:
                promise.result()
        else:
            # Fallback to serial filling of bitmasks for small-batch-size cases
            for req_id in structured_output_request_ids:
                request = requests[req_id]
                structured_output_request = request.structured_output_request

                if TYPE_CHECKING:
                    assert structured_output_request is not None
                    assert structured_output_request.grammar is not None
                apply_bitmask = self.should_fill_bitmask(request)

                # Advance the FSM through the speculative tokens to produce a
                # mask for every position, then roll it back afterwards.
                state_advancements = 0
                req_tokens = scheduled_spec_decode_tokens.get(req_id, [])
                for i, token in enumerate(req_tokens + [None]):
                    self._fill_bitmasks(
                        [
                            (
                                structured_output_request.grammar,
                                cumulative_index,
                                apply_bitmask,
                            )
                        ]
                    )

                    if (
                        apply_bitmask
                        and token is not None
                        and not structured_output_request.grammar.is_terminated()
                    ):
                        accepted = structured_output_request.grammar.accept_tokens(
                            req_id, [token]
                        )
                        assert accepted, (token, req_id, scheduled_spec_decode_tokens)
                        state_advancements += 1
                    cumulative_index += 1
                if state_advancements > 0:
                    structured_output_request.grammar.rollback(state_advancements)

        bitmask_tensor = self._grammar_bitmask
        # Trim unused rows before shipping to the workers.
        if cumulative_index < bitmask_tensor.shape[0]:
            bitmask_tensor = bitmask_tensor[:cumulative_index]

        # After finishing with the xgrammar operations, we convert to
        # np.ndarray, because that is much more efficient for serialization
        # and deserialization when sending this to the GPU workers.
        return bitmask_tensor.numpy()

    def should_fill_bitmask(self, request: Request) -> bool:
        """Return True when this request's sampling should be constrained now."""
        # NOTE (Hanchen) if enable_in_reasoning is True, it means that
        # the model needs to be constrained in reasoning. So we should always
        # enable the bitmask filling.

        if self.reasoner is not None:
            if self.enable_in_reasoning:
                return True
            assert request.structured_output_request is not None
            # Lazily determine (once) whether the prompt already ends the
            # reasoning section.
            if request.structured_output_request.reasoning_ended is None:
                request.structured_output_request.reasoning_ended = (
                    self.reasoner.is_reasoning_end(request.prompt_token_ids)
                )
            return request.structured_output_request.reasoning_ended
        return True

    def should_advance(self, request: Request) -> bool:
        """Return True when the request's FSM should consume new tokens."""
        if not request.use_structured_output:
            return False

        # To determine whether we can advance the FSM.
        # Supports thinking usage where we skip the reasoning components.
        if TYPE_CHECKING:
            assert request.structured_output_request is not None
            assert request.structured_output_request.grammar is not None
        # by default, we should always advance
        # for cases that don't use thinking mode.
        if self.reasoner is None:
            return True

        # if the model needs structured in reasoning, we should advance
        if self.enable_in_reasoning:
            return True

        structured_req = request.structured_output_request
        if structured_req.reasoning_ended:
            return True

        # Check if reasoning ends in *this* step
        if self.reasoner.is_reasoning_end_streaming(
            request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
        ):
            # Reasoning just ended, so we shouldn't advance til
            # next pass
            structured_req.reasoning_ended = True

        return False

    def clear_backend(self) -> None:
        """Release backend resources (no-op when no backend was created)."""
        if self.backend is not None:
            self.backend.destroy()
|
||||
265
vllm/v1/structured_output/backend_guidance.py
Normal file
265
vllm/v1/structured_output/backend_guidance.py
Normal file
@@ -0,0 +1,265 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputBackend,
|
||||
StructuredOutputGrammar,
|
||||
StructuredOutputOptions,
|
||||
)
|
||||
from vllm.v1.structured_output.request import get_structured_output_key
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import llguidance
|
||||
import llguidance.hf as llguidance_hf
|
||||
import llguidance.torch as llguidance_torch
|
||||
else:
|
||||
llguidance = LazyLoader("llguidance", globals(), "llguidance")
|
||||
llguidance_hf = LazyLoader("llguidance.hf", globals(), "llguidance.hf")
|
||||
llguidance_torch = LazyLoader("llguidance.torch", globals(), "llguidance.torch")
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def _walk_json_for_additional_properties(data: object):
|
||||
if isinstance(data, dict):
|
||||
for value in data.values():
|
||||
_walk_json_for_additional_properties(value)
|
||||
if "additionalProperties" not in data and (
|
||||
"properties" in data or "patternProperties" in data
|
||||
):
|
||||
data["additionalProperties"] = False
|
||||
elif isinstance(data, list):
|
||||
for item in data:
|
||||
_walk_json_for_additional_properties(item)
|
||||
|
||||
|
||||
def process_for_additional_properties(
    guide_json: str | dict[str, Any],
) -> dict[str, Any]:
    """Return a schema object with ``additionalProperties: false`` enforced.

    Accepts either a JSON string or an already-parsed schema dict; the input
    dict is never mutated (a deep copy is modified instead).
    """
    guide_json_obj = (
        json.loads(guide_json)
        if isinstance(guide_json, str)
        # copy for modifications
        else copy.deepcopy(guide_json)
    )
    _walk_json_for_additional_properties(guide_json_obj)
    return guide_json_obj
|
||||
|
||||
|
||||
@dataclass
class GuidanceBackend(StructuredOutputBackend):
    """StructuredOutputBackend implementation backed by llguidance."""

    def __post_init__(self):
        self.disable_any_whitespace = (
            self.vllm_config.structured_outputs_config.disable_any_whitespace
        )
        self.disable_additional_properties = (
            self.vllm_config.structured_outputs_config.disable_additional_properties
        )

        self.ll_tokenizer = llguidance_hf.from_tokenizer(
            self.tokenizer, self.vocab_size
        )

    def compile_grammar(
        self, request_type: StructuredOutputOptions, grammar_spec: str
    ) -> StructuredOutputGrammar:
        """Serialize the spec into an llguidance grammar and build a matcher.

        NOTE: this method may run concurrently on the manager's compilation
        thread pool, so the serialized grammar is kept in a local variable.
        The previous pattern of writing ``self.serialized_grammar`` and then
        reading it back could observe another request's grammar when two
        compilations raced on this shared backend instance.
        """
        serialized_grammar = serialize_guidance_grammar(
            request_type,
            grammar_spec,
            self.disable_any_whitespace,
            self.disable_additional_properties,
        )
        # Kept for observability / backward compatibility only (last writer
        # wins); never read this back for correctness.
        self.serialized_grammar = serialized_grammar

        ll_matcher = llguidance.LLMatcher(
            self.ll_tokenizer,
            serialized_grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )

        r = GuidanceGrammar(
            ll_matcher=ll_matcher,
            ll_tokenizer=self.ll_tokenizer,
            vocab_size=self.vocab_size,
        )

        # Surface any construction-time matcher error in the logs.
        r.check_error()
        return r

    def allocate_token_bitmask(self, max_num_seqs: int):
        """Allocate the shared token bitmask tensor for up to max_num_seqs rows."""
        return llguidance_torch.allocate_token_bitmask(
            max_num_seqs, self.ll_tokenizer.vocab_size
        )

    def destroy(self):
        # No resources to release.
        pass
|
||||
|
||||
|
||||
@dataclass
class GuidanceGrammar(StructuredOutputGrammar):
    """llguidance-backed grammar state for a single request."""

    ll_matcher: llguidance.LLMatcher
    ll_tokenizer: llguidance.LLTokenizer
    vocab_size: int
    # Whether a matcher error has already been logged (log once per grammar).
    printed_error: bool = False
    # True once EOS was seen while the matcher was already stopped.
    terminated: bool = False
    # 1 when `terminated` was set by an EOS the matcher itself never consumed,
    # so rollback() must rewind one token fewer than requested.
    rollback_lag: int = 0

    def check_error(self):
        """Log the matcher's pending error, at most once per grammar."""
        if not self.printed_error:
            err = self.ll_matcher.get_error()
            if err:
                self.printed_error = True
                logger.warning("LLMatcher error: %s", err)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the parser.

        Returns True if the parser was advanced successfully.
        Returns False if the parser failed to advance.
        """

        if self.ll_tokenizer.eos_token in tokens:
            if self.ll_matcher.is_stopped() and not self.terminated:
                # EOS arrives after the matcher already stopped, so it will
                # not be consumed below; record the lag so rollback() stays
                # aligned with the matcher's internal position.
                self.rollback_lag = 1
                self.terminated = True

        if self.ll_matcher.is_stopped():
            # Once stopped, anything (including EOS) is accepted as-is.
            return True

        # TODO - Add jump decoding support in the future:
        # self.ll_matcher.compute_ff_bytes() - this should always work
        # self.ll_matcher.compute_ff_tokens() - this only works for
        #   "canonical" tokenizers
        # For conversion between the two, see
        # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md

        r = self.ll_matcher.consume_tokens(tokens)

        self.check_error()

        return r

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """Checks if the list of tokens are accepted by the parser in sequence.
        Will not advance the parser.

        Returns the prefix list of tokens that are accepted by the parser.
        """
        if len(tokens) == 0:
            return []
        if self.ll_matcher.is_stopped():
            return []

        num_tokens = self.ll_matcher.validate_tokens(tokens)

        self.check_error()

        return tokens[:num_tokens]

    def rollback(self, num_tokens: int) -> None:
        """Rewind the matcher by ``num_tokens`` (used after spec decode).

        Subtracts ``rollback_lag`` because an EOS counted by the caller may
        never have been consumed by the matcher (see accept_tokens).
        """
        if num_tokens > 0:
            self.ll_matcher.rollback(num_tokens - self.rollback_lag)
            self.terminated = False
            self.rollback_lag = 0
            self.check_error()

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        """Write the allowed-token mask for the current state into row ``idx``."""
        # this will automatically return [EOS] mask if the matcher is stopped
        # or otherwise in an error state
        llguidance_torch.fill_next_token_bitmask(self.ll_matcher, bitmask, idx)
        self.check_error()

    def is_terminated(self) -> bool:
        """True once EOS was accepted after the matcher stopped."""
        return self.terminated

    def reset(self):
        """Reset the matcher to its initial state."""
        # This method may be not needed anymore? TODO
        self.ll_matcher.reset()
|
||||
|
||||
|
||||
def serialize_guidance_grammar(
    request_type: StructuredOutputOptions,
    grammar_spec: str | dict[str, Any],
    disable_any_whitespace: bool = False,
    disable_additional_properties: bool = False,
) -> str:
    """Convert a vLLM structured-output spec into an llguidance grammar string.

    Args:
        request_type: which kind of spec ``grammar_spec`` holds (JSON schema,
            regex, grammar, choice list, or structural tag).
        grammar_spec: the spec itself; a JSON string or an already-parsed dict
            for the JSON / structural-tag cases.
        disable_any_whitespace: forbid flexible whitespace in generated JSON.
        disable_additional_properties: force ``additionalProperties: false``
            on every object schema.

    Raises:
        ValueError: for unsupported request types or structural tags whose
            ``begin`` matches no trigger.
    """

    def _process_schema(
        grammar_spec: str | dict[str, Any],
    ) -> str:
        # Optionally pin additionalProperties=false before serializing.
        if disable_additional_properties:
            grammar_spec = process_for_additional_properties(grammar_spec)
        return llguidance.LLMatcher.grammar_from_json_schema(
            grammar_spec,
            defaults={
                "whitespace_flexible": not disable_any_whitespace,
            },
        )

    if request_type == StructuredOutputOptions.JSON:
        return _process_schema(grammar_spec)
    elif request_type == StructuredOutputOptions.JSON_OBJECT:
        # "Any JSON object" — a minimal object-typed schema.
        return llguidance.LLMatcher.grammar_from_json_schema(
            '{"type": "object"}',
            defaults={
                "whitespace_flexible": not disable_any_whitespace,
            },
        )
    else:
        if request_type == StructuredOutputOptions.REGEX:
            tp = "regex"
        elif request_type == StructuredOutputOptions.GRAMMAR:
            tp = "grammar"
        elif request_type == StructuredOutputOptions.CHOICE:
            tp = "choice"
        elif request_type == StructuredOutputOptions.STRUCTURAL_TAG:
            if isinstance(grammar_spec, str):
                s_tag = json.loads(grammar_spec)
            else:
                s_tag = grammar_spec
            triggers: list[str] = s_tag["triggers"]
            tags: list[llguidance.StructTag] = []
            for s in s_tag["structures"]:
                begin: str = s["begin"]
                # Every structure's "begin" must start with one of the
                # declared triggers.
                trig = next((t for t in triggers if begin.startswith(t)), None)
                if trig is None:
                    raise ValueError(
                        f"Trigger {begin} not found in triggers {triggers}"
                    )
                tags.append(
                    llguidance.StructTag(
                        trigger=trig,
                        begin=s["begin"],
                        grammar=_process_schema(s["schema"]),
                        end=s["end"],
                    )
                )
            if not tags:
                raise ValueError("No structural tags found in the grammar spec.")
            return llguidance.StructTag.to_grammar(tags)
        else:
            logger.error(
                "Validation should have already occurred. Please file an issue."
            )
            raise ValueError(
                f"grammar is not of valid supported types. ({request_type!s})"
            )
        # Reached only for REGEX / GRAMMAR / CHOICE.
        return llguidance.grammar_from(tp, grammar_spec)
|
||||
|
||||
|
||||
def validate_guidance_grammar(
    sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None
) -> None:
    """Raise ValueError when llguidance rejects the request's grammar."""
    request_type, spec = get_structured_output_key(sampling_params.structured_outputs)
    serialized = serialize_guidance_grammar(request_type, spec)
    err = llguidance.LLMatcher.validate_grammar(serialized, tokenizer)
    if err:
        raise ValueError(f"Grammar error: {err}")
|
||||
177
vllm/v1/structured_output/backend_lm_format_enforcer.py
Normal file
177
vllm/v1/structured_output/backend_lm_format_enforcer.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import ast
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from functools import lru_cache
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputBackend,
|
||||
StructuredOutputGrammar,
|
||||
StructuredOutputOptions,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import lmformatenforcer
|
||||
import lmformatenforcer.integrations.vllm as lmfe_vllm
|
||||
else:
|
||||
lmformatenforcer = LazyLoader("lmformatenforcer", globals(), "lmformatenforcer")
|
||||
lmfe_vllm = LazyLoader(
|
||||
"lmformatenforcer.integrations.vllm",
|
||||
globals(),
|
||||
"lmformatenforcer.integrations.vllm",
|
||||
)
|
||||
|
||||
|
||||
@lru_cache
def _cached_build_vllm_token_enforcer_tokenizer_data(
    tokenizer: PreTrainedTokenizerBase, vocab_size: int
) -> "lmfe_vllm.TokenEnforcerTokenizerData":
    """Build and memoize the lm-format-enforcer tokenizer data.

    Cached per (tokenizer, vocab_size) because construction is expensive and
    the result is reusable across all grammars for the same tokenizer.
    """
    tokenizer_data = lmfe_vllm.build_vllm_token_enforcer_tokenizer_data(
        tokenizer, use_bitmask=True, vocab_size=vocab_size
    )
    return tokenizer_data
|
||||
|
||||
|
||||
@dataclass
class LMFormatEnforcerGrammar(StructuredOutputGrammar):
    """lm-format-enforcer-backed grammar state for a single request.

    The enforcer itself is queried with an explicit token prefix, so the
    accepted-token history is tracked here in ``current_tokens_prefix``.
    """

    token_enforcer: lmformatenforcer.TokenEnforcer
    current_tokens_prefix: list[int] = field(default_factory=list)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Append ``tokens`` to the prefix if the enforcer allows each one.

        Atomic: on the first disallowed token the prefix is restored to its
        original state and False is returned; otherwise True.
        """
        original_len = len(self.current_tokens_prefix)
        for token in tokens:
            if not self.token_enforcer.get_allowed_tokens(
                self.current_tokens_prefix
            ).is_token_allowed(token):
                # Rollback partial updates to ensure atomicity.
                del self.current_tokens_prefix[original_len:]
                return False
            self.current_tokens_prefix.append(token)
        return True

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """Return the longest prefix of ``tokens`` the enforcer would accept
        in sequence, without advancing any state."""
        for prefix_length in range(len(tokens)):
            prefix = tokens[:prefix_length]
            next_token = tokens[prefix_length]
            if not self.token_enforcer.get_allowed_tokens(
                self.current_tokens_prefix + prefix
            ).is_token_allowed(next_token):
                break
        else:
            # Every token (or an empty list) was accepted.
            return tokens

        return tokens[:prefix_length]

    def rollback(self, num_tokens: int) -> None:
        """Drop the last ``num_tokens`` accepted tokens.

        BUGFIX: guard num_tokens == 0 — the previous unconditional
        ``prefix[:-num_tokens]`` slice evaluates to ``[:0]`` when
        num_tokens is 0 and would wrongly erase the entire prefix.
        (Matches the guard in GuidanceGrammar.rollback.)
        """
        if num_tokens > 0:
            self.current_tokens_prefix = self.current_tokens_prefix[:-num_tokens]

    def fill_bitmask(self, bitmask: torch.Tensor, batch_index: int) -> None:
        """Write the allowed-token bitmask for the current prefix into row
        ``batch_index`` of ``bitmask``."""
        allowed_tokens = self.token_enforcer.get_allowed_tokens(
            self.current_tokens_prefix
        )
        bitmask[batch_index] = allowed_tokens.allowed_tokens

    def is_terminated(self) -> bool:
        # We are considered terminated if the prefix ends with eos_token_id
        return_value = (
            len(self.current_tokens_prefix) > 0
            and self.current_tokens_prefix[-1] == self.token_enforcer.eos_token_id
        )
        return return_value

    def reset(self):
        """Forget all accepted tokens, returning to the initial state."""
        self.current_tokens_prefix = []
|
||||
|
||||
|
||||
@dataclass
class LMFormatEnforcerBackend(StructuredOutputBackend):
    """StructuredOutputBackend implementation backed by lm-format-enforcer."""

    def __post_init__(self):
        # Tokenizer data is expensive to build; memoized at module level per
        # (tokenizer, vocab_size).
        self.tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
            self.tokenizer, self.vocab_size
        )

    def compile_grammar(
        self, request_type: StructuredOutputOptions, grammar_spec: str
    ) -> StructuredOutputGrammar:
        """Build an LMFormatEnforcerGrammar for the given request.

        Raises:
            ValueError: if the request type is unsupported, or if speculative
                decoding is enabled (this backend cannot roll back tokens).
        """
        character_level_parser: lmformatenforcer.CharacterLevelParser
        if request_type == StructuredOutputOptions.JSON:
            spec_dict = json.loads(grammar_spec)
            character_level_parser = lmformatenforcer.JsonSchemaParser(spec_dict)
        elif request_type == StructuredOutputOptions.JSON_OBJECT:
            # Schema-less parser: any JSON object is acceptable.
            character_level_parser = lmformatenforcer.JsonSchemaParser(None)
        elif request_type == StructuredOutputOptions.REGEX:
            character_level_parser = lmformatenforcer.RegexParser(grammar_spec)
        elif request_type == StructuredOutputOptions.CHOICE:
            # grammar_spec is a Python-literal list of choice strings.
            choices = ast.literal_eval(grammar_spec)
            character_level_parser = lmformatenforcer.UnionParser(
                [lmformatenforcer.StringParser(choice) for choice in choices]
            )
        else:
            raise ValueError(
                f"Invalid request type for LM Format Enforcer backend({request_type!s})"
            )
        max_rollback_tokens = (
            self.vllm_config.speculative_config.num_speculative_tokens
            if self.vllm_config.speculative_config is not None
            else 0
        )

        if max_rollback_tokens > 0:
            raise ValueError(
                "LM Format Enforcer backend does not support speculative tokens"
            )

        token_enforcer = lmformatenforcer.TokenEnforcer(
            tokenizer_data=self.tokenizer_data,
            parser=character_level_parser,
        )
        return LMFormatEnforcerGrammar(token_enforcer)

    def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:
        # One int32 holds 32 vocab bits; -1 (all bits set) means "all tokens
        # allowed".
        return torch.full(
            (max_num_seqs, (self.vocab_size + 31) // 32),
            -1,
            dtype=torch.int32,
            pin_memory=torch.cuda.is_available(),
        )

    def destroy(self):
        # No resources to release.
        pass
|
||||
|
||||
|
||||
def validate_structured_output_request_lm_format_enforcer(params: SamplingParams):
    """Validate that the structured-output params are usable with the
    LM Format Enforcer backend, raising ValueError otherwise."""
    if params.structured_outputs is None:
        return

    so_params = params.structured_outputs

    if so_params.regex:
        return
    if so_params.json:
        if isinstance(so_params.json, str):
            try:
                # make sure schema is valid json
                json.loads(so_params.json)
            except json.JSONDecodeError as e:
                raise ValueError("Invalid JSON grammar specification.") from e
        else:
            try:
                json.dumps(so_params.json)
            except Exception as e:
                raise ValueError(
                    f"Error serializing structured outputs jsonschema: {e}"
                ) from e
        return
    if so_params.choice:
        return
    if so_params.grammar:
        raise ValueError(
            "LM Format Enforcer structured outputs backend "
            "does not support grammar specifications"
        )
|
||||
324
vllm/v1/structured_output/backend_outlines.py
Normal file
324
vllm/v1/structured_output/backend_outlines.py
Normal file
@@ -0,0 +1,324 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright 2025-present the Outlines developers
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import importlib
|
||||
import json
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
from regex import escape as regex_escape
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputBackend,
|
||||
StructuredOutputGrammar,
|
||||
StructuredOutputOptions,
|
||||
)
|
||||
from vllm.v1.structured_output.utils import (
|
||||
OutlinesVocabulary,
|
||||
get_outlines_cache,
|
||||
get_outlines_vocabulary,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import outlines_core as oc
|
||||
import outlines_core.json_schema as json_schema
|
||||
else:
|
||||
oc = LazyLoader("oc", globals(), "outlines_core")
|
||||
json_schema = LazyLoader("json_schema", globals(), "outlines_core.json_schema")
|
||||
|
||||
# Python 3.11+ sre_parse and sre_constants
|
||||
# are deprecated, so we must import them from re
|
||||
if sys.version_info >= (3, 11):
|
||||
# Hack to get around pre-commit regex module rule
|
||||
# because going through re is the only way to get sre_parse
|
||||
# and sre_constants in Python 3.11+
|
||||
_re = importlib.import_module("re")
|
||||
sre_parse = _re._parser
|
||||
sre_constants = _re._constants
|
||||
else:
|
||||
import sre_constants
|
||||
import sre_parse
|
||||
|
||||
|
||||
@dataclass
class OutlinesBackend(StructuredOutputBackend):
    """StructuredOutputBackend implementation backed by outlines_core."""

    def __post_init__(self):
        self.vocabulary = get_outlines_vocabulary(self.tokenizer)
        # Cache of compiled indexes, keyed by vocabulary hash + regex string
        # (see get_outlines_cache for its backing store).
        self.cache = get_outlines_cache()

    def _compile_index(
        self, regex_string: str, vocabulary: OutlinesVocabulary
    ) -> oc.Index:
        """Compile (or fetch from cache) the automaton index for the regex."""
        cache_key = f"{vocabulary._hash}_{regex_string}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        index = oc.Index(regex_string, vocabulary.inner)
        self.cache[cache_key] = index

        return index

    def compile_grammar(
        self, request_type: StructuredOutputOptions, grammar_spec: str
    ) -> StructuredOutputGrammar:
        """Translate the request spec to a regex and build an OutlinesGrammar.

        Raises:
            ValueError: for request types other than JSON / REGEX / CHOICE.
        """
        if request_type == StructuredOutputOptions.JSON:
            regex = json_schema.build_regex_from_schema(grammar_spec)
        elif request_type == StructuredOutputOptions.REGEX:
            regex = grammar_spec
        elif request_type == StructuredOutputOptions.CHOICE:
            # grammar_spec is a Python-literal list of strings; build an
            # alternation over the regex-escaped choices.
            choices = ast.literal_eval(grammar_spec)
            choices = [regex_escape(c) for c in choices]
            regex = "(" + "|".join(choices) + ")"
        else:
            raise ValueError(
                f"Invalid request type for Outlines backend ({request_type!s})"
            )
        index = self._compile_index(regex, self.vocabulary)
        # Allow rolling back up to the per-step speculative token count.
        max_rollback_tokens = (
            self.vllm_config.speculative_config.num_speculative_tokens
            if self.vllm_config.speculative_config is not None
            else 0
        )
        return OutlinesGrammar(
            vocab_size=self.vocab_size,
            guide=oc.Guide(index, max_rollback=max_rollback_tokens),
        )

    def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:
        # One int32 holds 32 vocab bits; -1 (all bits set) means "all tokens
        # allowed".
        return torch.full(
            (max_num_seqs, (self.vocab_size + 31) // 32),
            -1,
            dtype=torch.int32,
            pin_memory=torch.cuda.is_available(),
        )

    def destroy(self):
        # No resources to release.
        pass
|
||||
|
||||
|
||||
@dataclass
class OutlinesGrammar(StructuredOutputGrammar):
    """FSM state for a single request, backed by an outlines_core Guide."""

    vocab_size: int
    guide: oc.Guide = field(hash=False)
    num_processed_tokens: int = field(
        default_factory=lambda: 0, repr=False, hash=False, init=False
    )

    # outlines_core signals done on DFA accept; vLLM expects done after EOS.
    # We delay the finished flag by one step so EOS can still be emitted.
    _prev_finished: bool = field(default=False, init=False, repr=False, hash=False)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the FSM.

        Returns True if the FSM was advanced successfully.
        Returns False if the FSM failed to advance.
        """
        if not self.guide.accepts_tokens(tokens):
            return False
        # The whole sequence was vetted above, so each individual
        # advance() below cannot fail.
        for tok in tokens:
            self.guide.advance(tok)
            self.num_processed_tokens += 1
        return True

    def rollback(self, num_tokens: int) -> None:
        """Rewind the guide and the processed-token counter."""
        self.guide.rollback_state(num_tokens)
        self.num_processed_tokens -= num_tokens

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """Return the longest accepted prefix of ``tokens``; the FSM is
        never advanced."""
        prefix: list[int] = []
        for tok in tokens:
            if not self.guide.accepts_tokens(prefix + [tok]):
                break
            prefix.append(tok)
        return prefix

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        """Write the allowed-token mask for the next step into row ``idx``."""
        row = bitmask[idx]
        self.guide.write_mask_into(row.data_ptr(), row.numel(), row.element_size())

    def is_terminated(self) -> bool:
        """Report the previous step's finished state (one-step delay so the
        EOS token can still be emitted; see the class comment)."""
        now_finished = self.guide.is_finished()
        was_finished, self._prev_finished = self._prev_finished, now_finished
        return was_finished

    def reset(self):
        """Return the grammar to its initial state for reuse."""
        self.num_processed_tokens = 0
        self._prev_finished = False
        self.guide.reset()
|
||||
|
||||
|
||||
def validate_structured_output_request_outlines(params: SamplingParams):
    """Reject structured-output params that the Outlines backend cannot
    serve; raises ValueError on invalid or unsupported specs."""
    if params.structured_outputs is None:
        return

    so_params = params.structured_outputs

    if so_params.regex:
        validate_regex_is_buildable(so_params.regex)
        return
    if so_params.json:
        if isinstance(so_params.json, str):
            # Already a string: just verify it parses as JSON.
            try:
                json.loads(so_params.json)
            except json.JSONDecodeError as e:
                raise ValueError("Invalid JSON grammar specification.") from e
            schema = so_params.json
        else:
            try:
                schema = json.dumps(so_params.json)
            except Exception as e:
                raise ValueError(
                    f"Error serializing structured outputs jsonschema: {e}"
                ) from e
        validate_regex_is_buildable(json_schema.build_regex_from_schema(schema))
        return
    if so_params.choice:
        escaped = [regex_escape(str(choice)) for choice in so_params.choice]
        validate_regex_is_buildable("(" + "|".join(escaped) + ")")
        return
    if so_params.grammar:
        raise ValueError(
            "Outlines structured outputs backend "
            "does not support grammar specifications"
        )
|
||||
|
||||
|
||||
def _prefix_needs_context(parsed) -> bool:
    """Return True if there's a look-around/anchor before any consumer.

    ``parsed`` is a sre_parse parse tree (SubPattern or raw token list).
    A pattern whose prefix contains an anchor or look-around before any
    character-consuming token lacks an anchored universal start state
    (see validate_regex_is_buildable).
    """

    def subpattern_consumes(parsed) -> bool:
        """Return True if subpattern can consume at least one character."""
        # SubPattern objects carry the token list in .data; branches are
        # passed in as plain lists.
        tokens = parsed.data if hasattr(parsed, "data") else parsed
        for ttype, tval in tokens:
            # literal, character class, or dot always consumes
            if ttype in (sre_parse.LITERAL, sre_parse.IN, sre_parse.ANY):
                return True
            # quantified subpattern: check inner pattern
            elif ttype == sre_parse.MAX_REPEAT:
                # tval is (min, max, subpattern); a zero-max repeat can
                # never consume anything.
                _, mx, sub = tval
                if mx != 0 and subpattern_consumes(sub):
                    return True
            # alternation: if any branch consumes, the whole does
            elif ttype == sre_parse.BRANCH:
                _, branches = tval
                if any(subpattern_consumes(br) for br in branches):
                    return True
            # grouped subpattern: recurse into its contents
            # (tval is (group, add_flags, del_flags, subpattern))
            elif ttype == sre_parse.SUBPATTERN and subpattern_consumes(tval[3]):
                return True
        # No consumers, return False
        return False

    tokens = parsed.data if hasattr(parsed, "data") else parsed
    for ttype, tval in tokens:
        # Direct anchors or look-around
        if ttype == sre_parse.AT or ttype in (
            sre_constants.ASSERT,
            sre_constants.ASSERT_NOT,
        ):
            return True

        # Nested subpattern: check
        if ttype == sre_parse.SUBPATTERN:
            # tval: (group, add_flags, del_flags, subpattern)
            if _prefix_needs_context(tval[3]):
                return True
            # Once the group is known to consume a character the prefix is
            # over — nothing later can be a *prefix* anchor.
            if subpattern_consumes(tval[3]):
                return False

        # if any branch has a prefix anchor => True,
        # else if at least one branch consumes => prefix ends => False
        elif ttype == sre_parse.BRANCH:
            saw_consumer = False
            for br in tval[1]:
                if _prefix_needs_context(br):
                    return True
                if subpattern_consumes(br):
                    saw_consumer = True
            if saw_consumer:
                return False

        # Immediate consumer tokens
        elif ttype in (sre_parse.LITERAL, sre_parse.IN, sre_parse.ANY):
            return False

        # if subpattern has anchor => True, if it can consume => stop
        elif ttype == sre_parse.MAX_REPEAT:
            # tval is (min, max, subpattern)
            if _prefix_needs_context(tval[2]):
                return True
            if subpattern_consumes(tval[2]):
                return False

    return False
|
||||
|
||||
|
||||
def _check_unsupported(parsed) -> None:
|
||||
"""Check for regex features unsupported by regex-automata"""
|
||||
tokens = parsed.data if hasattr(parsed, "data") else parsed
|
||||
for ttype, tval in tokens:
|
||||
# backreference
|
||||
if ttype in (sre_parse.GROUPREF, sre_parse.GROUPREF_EXISTS):
|
||||
raise ValueError("Backreferences are unsupported.")
|
||||
|
||||
# look-around assertion
|
||||
elif ttype in (sre_constants.ASSERT, sre_constants.ASSERT_NOT):
|
||||
raise ValueError("Look-Around assertion are unsupported.")
|
||||
|
||||
# unicode word boundaries
|
||||
elif ttype == sre_parse.AT:
|
||||
if tval in (sre_constants.AT_BOUNDARY, sre_constants.AT_NON_BOUNDARY):
|
||||
raise ValueError("Unicode word boundaries are unsupported.")
|
||||
|
||||
elif ttype == sre_parse.BRANCH:
|
||||
# tval is (None, branches)
|
||||
for branch in tval[1]:
|
||||
_check_unsupported(branch)
|
||||
|
||||
# tval is (min, max, subpattern)
|
||||
elif ttype == sre_parse.MAX_REPEAT:
|
||||
_check_unsupported(tval[2])
|
||||
|
||||
|
||||
def validate_regex_is_buildable(pattern: str) -> None:
    """
    Validates that the input regex is not using unsupported features
    of the `regex-automata` crate (outlines_core regex engine) and has a
    universal start state.
    definition of universal start state used can be found at:
    https://docs.rs/regex-automata/latest/regex_automata/dfa/trait.Automaton.html#method.universal_start_state

    Raises ValueError when the pattern cannot be parsed, uses an
    unsupported feature, or needs context before any token is matched.
    """
    try:
        parsed = sre_parse.parse(pattern)

    except sre_constants.error as e:
        raise ValueError(f"Error parsing regex: {e}") from e

    try:
        _check_unsupported(parsed)
    except ValueError as e:
        raise ValueError(
            f"Regex uses unsupported feature for structured outputs: {e}. "
            "Only basic matching constructs are supported—lookarounds, "
            "backreferences, and unicode boundaries are not."
        ) from e

    if _prefix_needs_context(parsed):
        # BUGFIX: the original message concatenated fragments without
        # separators ("...start stateThis means..." /
        # "...matched.structured outputs...") and read "a anchored".
        raise ValueError(
            "Regex does not have an anchored universal start state. "
            "This means that the Regex uses anchors (^) or look-arounds "
            "in a way which requires context before any token is matched. "
            "Structured outputs needs regexes that can match without needing "
            "that context. Try rewriting the pattern without using these "
            f"constructs. Pattern:\n{pattern}"
        )
|
||||
136
vllm/v1/structured_output/backend_types.py
Normal file
136
vllm/v1/structured_output/backend_types.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
else:
|
||||
VllmConfig = object
|
||||
TokenizerLike = object
|
||||
|
||||
|
||||
class StructuredOutputOptions(enum.Enum):
    """Kinds of structured-output constraint a request may carry."""

    JSON = enum.auto()  # JSON conforming to a user-provided JSON schema
    JSON_OBJECT = enum.auto()  # any JSON object, no specific schema
    REGEX = enum.auto()  # output must match a regular expression
    GRAMMAR = enum.auto()  # output must match a user-provided grammar
    CHOICE = enum.auto()  # output must be one of a fixed set of strings
    STRUCTURAL_TAG = enum.auto()  # structural-tag spec (JSON-encoded)


# (constraint kind, canonical spec string) — uniquely identifies a
# structured-output request, e.g. for grammar caching/deduplication.
StructuredOutputKey = tuple[StructuredOutputOptions, str]
|
||||
|
||||
|
||||
class StructuredOutputGrammar(ABC):
    """Per-request grammar interface for structured output requests.

    Implementations wrap a compiled grammar plus the mutable FSM state
    for a single request.
    """

    @abstractmethod
    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """
        Determines whether the provided tokens are accepted for the
        given request, advancing the FSM on success.

        Args:
            request_id (str): The unique identifier for the request.
            tokens (list[int]): A list of token IDs to evaluate.

        Returns:
            bool: True if the tokens are accepted, False otherwise.
        """

    @abstractmethod
    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """
        Validates the provided tokens against the grammar.
        Will not advance the FSM.

        Args:
            tokens (list[int]): A list of token IDs to validate.

        Returns:
            list[int]: A list of accepted token IDs. Will be a prefix
                of the input tokens, and empty if none are accepted.
        """

    @abstractmethod
    def rollback(self, num_tokens: int) -> None:
        """
        Rolls back the state of the grammar by a specified number of tokens.
        Will also revert counters for the number of processed tokens.
        Used e.g. when speculative draft tokens are rejected.

        Args:
            num_tokens (int): The number of tokens to roll back.
        """

    @abstractmethod
    def fill_bitmask(self, bitmask: "torch.Tensor", batch_index: int) -> None:
        """
        Fills the bitmask for a specific batch index with the tokens
        allowed at the current FSM state.

        Args:
            bitmask (torch.Tensor): The bitmask to fill
            batch_index (int): The index in the bitmask to fill
        """

    @abstractmethod
    def is_terminated(self) -> bool:
        """
        Checks whether the structured output process has terminated.

        Returns:
            bool: True if the process is terminated, False otherwise.
        """

    @abstractmethod
    def reset(self):
        """
        Resets the state of the structured output grammar so the
        instance can be reused.
        """
|
||||
|
||||
|
||||
@dataclass
class StructuredOutputBackend(ABC):
    """Engine-level backend for structured output requests."""

    # Full engine configuration; concrete backends read e.g. the
    # speculative-decoding and structured-outputs sections.
    vllm_config: VllmConfig
    # Tokenizer whose vocabulary the compiled grammars operate over.
    tokenizer: TokenizerLike
    # Size of the logits vocabulary dimension (may differ from the
    # tokenizer's own vocab size — see concrete backends).
    vocab_size: int

    @abstractmethod
    def compile_grammar(
        self, request_type: StructuredOutputOptions, grammar_spec: str
    ) -> StructuredOutputGrammar:
        """
        Compiles a grammar specification into a structured output grammar.

        Args:
            request_type (StructuredOutputOptions): The type of structured
                output request.
            grammar_spec (str): The grammar specification to compile.

        Returns:
            StructuredOutputGrammar: The compiled structured output grammar.
        """

    @abstractmethod
    def allocate_token_bitmask(self, max_num_seqs: int) -> "torch.Tensor":
        """
        Allocates a token bitmask for the specified maximum number of
        sequences (one row per sequence, 32 vocab entries per int32).

        Args:
            max_num_seqs (int): The maximum number of sequences for which
                to allocate the bitmask.
        """

    @abstractmethod
    def destroy(self):
        """
        Backend-specific cleanup.
        """
|
||||
378
vllm/v1/structured_output/backend_xgrammar.py
Normal file
378
vllm/v1/structured_output/backend_xgrammar.py
Normal file
@@ -0,0 +1,378 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputBackend,
|
||||
StructuredOutputGrammar,
|
||||
StructuredOutputOptions,
|
||||
)
|
||||
from vllm.v1.structured_output.utils import (
|
||||
choice_as_grammar,
|
||||
convert_lark_to_ebnf,
|
||||
grammar_is_likely_lark,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class XgrammarBackend(StructuredOutputBackend):
    """Engine-level structured output backend built on xgrammar."""

    def __post_init__(self):
        # Whether compiled JSON grammars must forbid arbitrary whitespace
        # between tokens (engine-wide setting).
        self.disable_any_whitespace = (
            self.vllm_config.structured_outputs_config.disable_any_whitespace
        )

        if isinstance(self.tokenizer, MistralTokenizer):
            # NOTE: ideally, xgrammar should handle this accordingly.
            # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
            stop_token_ids = [self.tokenizer.eos_token_id]

            # not self.tokenizer.vocab_size as self.tokenizer.vocab
            # collapses all decoded errors into a single token.
            self.vocab_size = len(self.tokenizer.vocab)
            tokenizer_info = xgr.TokenizerInfo(  # type: ignore
                encoded_vocab=self.tokenizer.vocab,
                # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43  # noqa: E501
                vocab_type=xgr.VocabType.RAW
                if self.tokenizer.is_tekken
                else xgr.VocabType.BYTE_FALLBACK,
                vocab_size=self.vocab_size,
                stop_token_ids=stop_token_ids,
                add_prefix_space=True,
            )
        elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
            # copy from xgr.TokenizerInfo.from_huggingface()
            # because we are using a custom tokenizer wrapper here.
            vocab_dict = self.tokenizer.get_vocab()
            tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
            vocab_size = self.vocab_size or tokenizer_vocab_size
            # maintain tokenizer's indexing
            encoded_vocab = [""] * vocab_size
            for token, idx in vocab_dict.items():
                if idx < vocab_size:
                    encoded_vocab[idx] = token
            stop_token_ids = [self.tokenizer.eos_token_id]
            backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
            # NOTE(review): relies on the private xgrammar helper
            # _detect_metadata_from_hf — may break across xgrammar versions.
            metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
            tokenizer_info = xgr.TokenizerInfo(
                encoded_vocab=encoded_vocab,
                vocab_type=metadata["vocab_type"],
                vocab_size=vocab_size,
                stop_token_ids=stop_token_ids,
                add_prefix_space=metadata["add_prefix_space"],
            )
        else:
            # Standard HF tokenizers: let xgrammar derive the metadata.
            tokenizer_info = xgr.TokenizerInfo.from_huggingface(
                self.tokenizer,
                vocab_size=self.vocab_size,
            )
        self.compiler = xgr.GrammarCompiler(
            tokenizer_info,
            max_threads=8,
            cache_enabled=True,
            cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
        )

        # Matchers must be able to roll back up to the number of draft
        # tokens produced per step by speculative decoding.
        self.num_speculative_tokens = 0
        if self.vllm_config.speculative_config is not None:
            self.num_speculative_tokens = (
                self.vllm_config.speculative_config.num_speculative_tokens
            )

    def compile_grammar(
        self, request_type: StructuredOutputOptions, grammar_spec: str
    ) -> StructuredOutputGrammar:
        """Compile ``grammar_spec`` into an XgrammarGrammar.

        Raises ValueError for request types that should have been
        rejected during validation.
        """
        if request_type == StructuredOutputOptions.JSON:
            ctx = self.compiler.compile_json_schema(
                grammar_spec, any_whitespace=not self.disable_any_whitespace
            )
        elif request_type == StructuredOutputOptions.JSON_OBJECT:
            # Any JSON object is acceptable; no user schema is involved.
            ctx = self.compiler.compile_json_schema(
                '{"type": "object"}', any_whitespace=not self.disable_any_whitespace
            )
        elif request_type == StructuredOutputOptions.GRAMMAR:
            ctx = self.compiler.compile_grammar(grammar_spec)
        elif request_type == StructuredOutputOptions.REGEX:
            ctx = self.compiler.compile_regex(grammar_spec)
        elif request_type == StructuredOutputOptions.STRUCTURAL_TAG:
            # grammar_spec is a JSON-encoded structural-tag document.
            s_tag = json.loads(grammar_spec)
            if "structures" in s_tag:
                # Falling back to deprecated method of compiling structural tag
                tags = [
                    xgr.StructuralTagItem(
                        begin=s["begin"],
                        schema=json.dumps(s["schema"]),
                        end=s["end"],
                    )
                    for s in s_tag["structures"]
                ]
                ctx = self.compiler.compile_structural_tag(tags, s_tag["triggers"])
            else:
                ctx = self.compiler.compile_structural_tag(grammar_spec)
        else:
            logger.error(
                "Validation should have already occurred. Please file an issue."
            )
            raise ValueError(
                f"grammar is not of valid supported types. ({request_type!s})"
            )

        return XgrammarGrammar(
            matcher=xgr.GrammarMatcher(
                ctx,
                max_rollback_tokens=self.num_speculative_tokens,
            ),
            vocab_size=self.vocab_size,
            ctx=ctx,
        )

    def allocate_token_bitmask(self, max_num_seqs: int):
        # Delegate bitmask allocation/shape to xgrammar.
        return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size)

    def destroy(self):
        # Drop the compiler so its cache and resources can be released.
        del self.compiler
|
||||
|
||||
|
||||
@dataclass
class XgrammarGrammar(StructuredOutputGrammar):
    """Per-request FSM state backed by an xgrammar GrammarMatcher."""

    # NOTE: This would be a generic-enough class for
    # supporting different backends, in the future.
    # For now, just xgrammar.
    #
    # https://xgrammar.mlc.ai/docs/api/python/index.html#xgrammar.GrammarMatcher.find_jump_forward_string
    # for jump-forward decoding

    vocab_size: int
    matcher: xgr.GrammarMatcher = field(hash=False)
    ctx: xgr.CompiledGrammar = field(hash=False)
    num_processed_tokens: int = field(
        default_factory=lambda: 0, repr=False, hash=False, init=False
    )
    # Cached result of matcher.is_terminated(); kept in sync by
    # accept_tokens(), rollback(), and reset().
    _is_terminated: bool = field(default=False, repr=False, hash=False)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the FSM.

        Returns True if the FSM was advanced successfully.
        Returns False if the FSM failed to advance.
        """
        if self._is_terminated:
            # A terminated grammar cannot accept further tokens.
            return False
        for token in tokens:
            if not self.matcher.accept_token(token):
                logger.error(
                    "Failed to advance FSM for request %s "
                    "for token %s. Please file an issue.",
                    request_id,
                    token,
                )
                return False
            self.num_processed_tokens += 1
            self._is_terminated = self.matcher.is_terminated()
        return True

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """Checks if the list of tokens are accepted by the FSM in sequence.
        Will not advance the FSM.

        Returns the prefix list of tokens that are accepted by the FSM.
        """
        accepted_tokens = []
        for token in tokens:
            if self.matcher.accept_token(token):
                accepted_tokens.append(token)
            else:
                break
        if len(accepted_tokens) > 0:
            # Rollback the FSM to the initial state
            self.matcher.rollback(len(accepted_tokens))
        return accepted_tokens

    def rollback(self, num_tokens: int) -> None:
        """Rewind the matcher and resynchronize the cached terminal flag."""
        self.matcher.rollback(num_tokens)
        self.num_processed_tokens -= num_tokens
        self._is_terminated = self.matcher.is_terminated()

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        """Write the allowed-token mask for the next step into row ``idx``."""
        self.matcher.fill_next_token_bitmask(bitmask, idx)

    def is_terminated(self) -> bool:
        return self._is_terminated

    def reset(self):
        """Return the grammar to its initial (non-terminated) state."""
        self.num_processed_tokens = 0
        # BUGFIX: clear the cached terminal flag. Previously a grammar
        # that had terminated kept rejecting all tokens after reset()
        # because _is_terminated was never cleared even though the
        # underlying matcher was reset (cf. OutlinesGrammar.reset, which
        # clears its analogous _prev_finished flag).
        self._is_terminated = False
        self.matcher.reset()
|
||||
|
||||
|
||||
# cf https://github.com/mlc-ai/xgrammar/blob/a32ac892676d2eedc0327416105b9b06edfb94b2/cpp/json_schema_converter.cc
STRING_SUPPORTED_FORMATS = {
    "email",
    "date",
    "time",
    "date-time",
    "duration",
    "ipv4",
    "ipv6",
    "hostname",
    "uuid",
    "uri",
    "uri-reference",
    "uri-template",
    "json-pointer",
    "relative-json-pointer",
}


def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
    """Check if JSON schema contains features unsupported by xgrammar."""

    def check_object(obj: dict[str, Any]) -> bool:
        if not isinstance(obj, dict):
            return False

        obj_type = obj.get("type")

        # Numeric ranges: multipleOf is not supported.
        if obj_type in ("integer", "number") and "multipleOf" in obj:
            return True

        # Array keywords xgrammar cannot express.
        unsupported_array_keys = (
            "uniqueItems",
            "contains",
            "minContains",
            "maxContains",
        )
        if obj_type == "array" and any(k in obj for k in unsupported_array_keys):
            return True

        # String formats outside the converter's supported set.
        if (
            obj_type == "string"
            and "format" in obj
            and obj["format"] not in STRING_SUPPORTED_FORMATS
        ):
            return True

        # Object keywords xgrammar cannot express.
        if obj_type == "object" and any(
            k in obj for k in ("patternProperties", "propertyNames")
        ):
            return True

        # Recurse into nested schemas (dict values and dicts inside lists).
        children: list[dict[str, Any]] = []
        for value in obj.values():
            if isinstance(value, dict):
                children.append(value)
            elif isinstance(value, list):
                children.extend(item for item in value if isinstance(item, dict))
        return any(check_object(child) for child in children)

    return check_object(schema)
|
||||
|
||||
|
||||
def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
    """Validate that the request is supported by structured output.

    May normalize the request in place: a `choice` constraint is converted
    into an equivalent EBNF `grammar`, and a Lark `grammar` is converted
    to EBNF.

    Raises ValueError if the request is not supported.
    """
    if sampling_params.structured_outputs is None:
        return

    so_params = sampling_params.structured_outputs

    if so_params.regex:
        try:
            xgr.Grammar.from_regex(so_params.regex)
        except Exception as err:
            raise ValueError(
                f"Failed to transform regex into a grammar: {err}"
            ) from err

    if so_params.choice:
        choice_grammar = choice_as_grammar(so_params.choice)
        try:
            xgr.Grammar.from_ebnf(choice_grammar)
        except Exception as err:
            # BUGFIX: this message was missing its f-prefix, so the literal
            # text "{err}" was raised instead of the underlying error.
            raise ValueError(
                f"Failed to transform choices into a grammar: {err}"
            ) from err
        # Hand the equivalent grammar to the backend instead of the choices.
        so_params.choice = None
        so_params.grammar = choice_grammar
        return

    if so_params.json:
        if isinstance(so_params.json, str):
            try:
                schema = json.loads(so_params.json)
            except json.JSONDecodeError as e:
                raise ValueError("Invalid JSON grammar specification.") from e
        else:
            schema = so_params.json

        try:
            xgr.Grammar.from_json_schema(schema)
        except Exception as err:
            raise ValueError(
                f"Failed to transform json schema into a grammar: {err}"
            ) from err

        if has_xgrammar_unsupported_json_features(schema):
            raise ValueError(
                "The provided JSON schema contains features not supported by xgrammar."
            )
        return

    if so_params.grammar:
        if grammar_is_likely_lark(so_params.grammar):
            # xgrammar supports EBNF grammars only
            try:
                so_params.grammar = convert_lark_to_ebnf(so_params.grammar)
            except ValueError as e:
                raise ValueError(
                    "Failed to convert the grammar from Lark to EBNF. "
                ) from e

        # Test parsing EBNF grammar, possibly already converted from Lark
        try:
            # parse the grammar, but we aren't compiling it.
            xgr.Grammar.from_ebnf(so_params.grammar)
        except Exception as e:
            raise ValueError("Invalid grammar specification.") from e
        return

    if so_params.structural_tag:
        try:
            s_tag = json.loads(so_params.structural_tag)

            # Using the deprecated method of compiling structural tag
            if "structures" in s_tag:
                tags = [
                    xgr.StructuralTagItem(
                        begin=s["begin"],
                        schema=json.dumps(s["schema"]),
                        end=s["end"],
                    )
                    for s in s_tag["structures"]
                ]
                xgr.Grammar.from_structural_tag(tags, s_tag["triggers"])
            else:
                xgr.Grammar.from_structural_tag(so_params.structural_tag)
        except Exception as e:
            raise ValueError("Invalid structural tag specification.") from e
|
||||
94
vllm/v1/structured_output/request.py
Normal file
94
vllm/v1/structured_output/request.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import dataclasses
|
||||
import functools
|
||||
import json
|
||||
from concurrent.futures import Future
|
||||
from concurrent.futures._base import TimeoutError
|
||||
from typing import cast
|
||||
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputGrammar,
|
||||
StructuredOutputKey,
|
||||
StructuredOutputOptions,
|
||||
)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class StructuredOutputRequest:
    """Tracks the (possibly still-compiling) grammar for one request."""

    # Structured-output constraints taken from SamplingParams.
    params: StructuredOutputsParams
    # Compiled grammar, a Future resolving to one (compilation is
    # asynchronous), or None before a grammar is assigned.
    _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None
    # NOTE(review): presumably set by the reasoning parser once the
    # reasoning section ends; never written in this file — confirm.
    reasoning_ended: bool | None = None

    @staticmethod
    def from_sampling_params(
        sampling_params: SamplingParams | None,
    ) -> "StructuredOutputRequest | None":
        """Build a request wrapper, or None if no constraints are set."""
        if sampling_params is None:
            return None
        params = sampling_params.structured_outputs
        if params:
            if params.all_constraints_none():
                return None
            else:
                return StructuredOutputRequest(params=params)
        return None

    def _check_grammar_completion(self) -> bool:
        """Resolve the grammar Future if it finished; True when resolved."""
        # NOTE: We have to lazy import to gate circular imports
        from vllm.v1.request import RequestStatus

        if isinstance(self._grammar, Future):
            try:
                # We will check whether the future is ready within 100 us
                self._grammar = self._grammar.result(timeout=0.0001)
                # NOTE(review): `status` is not a declared field of this
                # dataclass; presumably read elsewhere once the FSM is
                # compiled — confirm against the scheduler.
                self.status = RequestStatus.WAITING
            except TimeoutError:
                return False
        return True

    @property
    def is_grammar_ready(self) -> bool:
        # Checking readiness also resolves a finished Future in place.
        return self._check_grammar_completion()

    @property
    def grammar(self) -> StructuredOutputGrammar | None:
        """The compiled grammar, or None while compilation is pending."""
        completed = self._check_grammar_completion()
        return (
            cast(StructuredOutputGrammar | None, self._grammar) if completed else None
        )

    @grammar.setter
    def grammar(
        self, grammar: StructuredOutputGrammar | Future[StructuredOutputGrammar]
    ) -> None:
        self._grammar = grammar

    @functools.cached_property
    def structured_output_key(self) -> StructuredOutputKey:
        # Cached: params are immutable for the lifetime of the request.
        return get_structured_output_key(self.params)
|
||||
|
||||
|
||||
def get_structured_output_key(params: StructuredOutputsParams) -> StructuredOutputKey:
    """Map ``params`` to a (constraint kind, canonical spec string) key.

    Precedence mirrors the order fields are checked: json, json_object,
    regex, choice, grammar, structural_tag. Raises ValueError when no
    constraint is set.
    """
    if params.json is not None:
        # Canonicalize dict/list schemas to a JSON string.
        spec = params.json if isinstance(params.json, str) else json.dumps(params.json)
        return StructuredOutputOptions.JSON, spec
    if params.json_object:
        return StructuredOutputOptions.JSON_OBJECT, ""
    if params.regex is not None:
        return StructuredOutputOptions.REGEX, params.regex
    if params.choice is not None:
        spec = (
            params.choice
            if isinstance(params.choice, str)
            else json.dumps(params.choice)
        )
        return StructuredOutputOptions.CHOICE, spec
    if params.grammar is not None:
        return StructuredOutputOptions.GRAMMAR, params.grammar
    if params.structural_tag is not None:
        return StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag
    raise ValueError("No valid structured output parameter found")
|
||||
469
vllm/v1/structured_output/utils.py
Normal file
469
vllm/v1/structured_output/utils.py
Normal file
@@ -0,0 +1,469 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import importlib.metadata
|
||||
import os
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import regex as re
|
||||
import torch
|
||||
from cachetools import LRUCache
|
||||
from diskcache import Cache
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import outlines_core as oc
|
||||
import transformers.file_utils as file_utils
|
||||
import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2
|
||||
import xgrammar as xgr
|
||||
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
oc = LazyLoader("oc", globals(), "outlines_core")
|
||||
file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils")
|
||||
tokenization_gpt2 = LazyLoader(
|
||||
"tokenization_gpt2",
|
||||
globals(),
|
||||
"transformers.models.gpt2.tokenization_gpt2",
|
||||
)
|
||||
|
||||
TokenizerLike = object
|
||||
SchedulerOutput = object
|
||||
InputBatch = object
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
CACHE = None
|
||||
|
||||
|
||||
def apply_grammar_bitmask(
    scheduler_output: SchedulerOutput,
    grammar_output: GrammarOutput,
    input_batch: InputBatch,
    logits: torch.Tensor,
) -> None:
    """
    Apply grammar bitmask to output logits of the model with xgrammar function.

    Args:
        scheduler_output (SchedulerOutput): The result of engine scheduling.
        grammar_output (GrammarOutput): Per-step structured-output state: the
            compacted grammar bitmask (np.ndarray) and the ids of the
            structured-output requests it covers.
        input_batch (InputBatch): The input of model runner.
        logits (torch.Tensor): The output logits of model forward.
            Modified in place.
    """
    # Serialization of np.ndarray is much more efficient than a tensor,
    # so we receive it in that format.
    grammar_bitmask = grammar_output.grammar_bitmask

    # We receive the structured output bitmask from the scheduler,
    # compacted to contain bitmasks only for structured output requests.
    # The order of the requests in the bitmask is not guaranteed to be the
    # same as the order of the requests in the gpu runner's batch. We need
    # to sort the bitmask to match the order of the requests used here.

    # Get the batch indices of the structured output requests.
    # Keep track of the number of speculative tokens scheduled for every
    # request in the batch, as the logit indices are offset by this amount.
    struct_out_req_batch_indices: dict[str, int] = {}
    cumulative_offset = 0
    # Walk requests in batch order (req_id_to_index maps id -> batch slot).
    seq = sorted(input_batch.req_id_to_index.items(), key=lambda x: x[1])
    for req_id, batch_index in seq:
        logit_index = batch_index + cumulative_offset
        # Each speculative token scheduled for an earlier request shifts the
        # logit rows of all later requests by one.
        cumulative_offset += len(
            scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
        )
        if req_id in grammar_output.structured_output_request_ids:
            struct_out_req_batch_indices[req_id] = logit_index

    out_indices = []

    # Reorder the bitmask to match the order of the requests in the batch.
    # Rows default to -1 (all bits set), i.e. "every token allowed", for
    # logit rows that belong to non-structured requests.
    sorted_bitmask = np.full(
        shape=(logits.shape[0], grammar_bitmask.shape[1]),
        fill_value=-1,
        dtype=grammar_bitmask.dtype,
    )
    cumulative_index = 0
    for req_id in grammar_output.structured_output_request_ids:
        num_spec_tokens = len(
            scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
        )
        if req_id in struct_out_req_batch_indices:
            logit_index = struct_out_req_batch_indices[req_id]
            # One bitmask row per token: the base token plus each
            # speculative token of this request.
            for i in range(1 + num_spec_tokens):
                sorted_bitmask[logit_index + i] = grammar_bitmask[cumulative_index + i]
                out_indices.append(logit_index + i)
        # The compacted bitmask always carries rows for this request, even if
        # it is absent from this runner's batch, so advance unconditionally.
        cumulative_index += 1 + num_spec_tokens

    # Copy async to device as tensor.
    grammar_bitmask = torch.from_numpy(sorted_bitmask).to(
        logits.device, non_blocking=True
    )

    # If the length of out indices and the logits have the same shape
    # we don't need to pass indices to the kernel,
    # since the bitmask is already aligned with the logits.
    skip_out_indices = len(out_indices) == logits.shape[0]

    index_tensor = None
    if not skip_out_indices:
        # xgrammar expects a python list of indices but it will actually work with
        # a tensor. If we copy the tensor ourselves here we can do it in a non_blocking
        # manner and there should be no cpu sync within xgrammar.
        index_tensor = torch.tensor(
            out_indices, dtype=torch.int32, device="cpu", pin_memory=True
        )
        index_tensor = index_tensor.to(logits.device, non_blocking=True)

    xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
|
||||
|
||||
|
||||
class OutlinesVocabulary:
    """Wrap an `outlines_core.Vocabulary` together with a stable hash.

    The hash is derived from a SHA-256 digest of the vocabulary's repr,
    giving a deterministic, non-negative integer that is safe to use as a
    cache key (Python's builtin ``hash`` is salted per process and can be
    negative).
    """

    def __init__(self, vocabulary: oc.Vocabulary) -> None:
        # Keep a reference to the wrapped vocabulary object.
        self.inner = vocabulary
        # SHA-256 of repr(vocabulary), interpreted as an integer cache key.
        digest = hashlib.sha256(repr(vocabulary).encode("utf-8")).hexdigest()
        self._hash = int(digest, 16)
|
||||
|
||||
|
||||
def get_outlines_cache_path() -> str:
    """Return the directory used to persist outlines' compiled-index cache.

    Resolution order: the ``OUTLINES_CACHE_DIR`` environment variable, then
    ``XDG_CACHE_HOME``, then ``~/.cache/outlines``, finally falling back to
    the system temp directory when no usable home exists (e.g. containers
    where ``~`` expands to ``/``).
    """
    explicit_dir = os.getenv("OUTLINES_CACHE_DIR")
    if explicit_dir:
        # An explicit override always takes precedence.
        return explicit_dir

    xdg_cache_home = os.getenv("XDG_CACHE_HOME")
    if xdg_cache_home:
        # NOTE(review): per the XDG Base Directory spec this arguably should
        # be ``$XDG_CACHE_HOME/outlines`` without the extra ".cache" segment;
        # kept as-is to preserve the existing cache location.
        return os.path.join(xdg_cache_home, ".cache", "outlines")

    home_dir = os.path.expanduser("~")
    if os.path.isdir(home_dir) and home_dir != "/":
        # Default Unix fallback. expanduser does not guarantee the path
        # exists, and "/" usually means a container without a real user, so
        # both conditions are checked before writing under the home dir.
        return os.path.join(home_dir, ".cache", "outlines")

    import tempfile

    # Last resort: the system temp directory is always writable.
    return os.path.join(tempfile.gettempdir(), ".cache", "outlines")
|
||||
|
||||
|
||||
def get_outlines_cache():
    """Return the cache instance used for outlines index caching.

    When ``VLLM_V1_USE_OUTLINES_CACHE`` is enabled, this is an unbounded
    on-disk ``diskcache.Cache`` that is cleared whenever the installed
    ``outlines_core`` version changes; otherwise a bounded in-memory LRU
    cache is used.
    """
    if not envs.VLLM_V1_USE_OUTLINES_CACHE:
        # Default: in-memory, bounded, safe for untrusted clients.
        return LRUCache(maxsize=128)

    logger.warning(
        "Enabling outlines cache. This is an unbounded on-disk "
        "cache. It may consume a lot of disk space and should "
        "not be used with untrusted clients."
    )
    disk_cache = Cache(
        get_outlines_cache_path(), eviction_policy="none", cull_limit=0
    )
    # Serialized indexes may be incompatible across outlines_core versions,
    # so invalidate the whole cache on a version change.
    current_version = importlib.metadata.version("outlines_core")
    if disk_cache.get("__version__", None) != current_version:
        disk_cache.clear()
    disk_cache.set("__version__", current_version)
    return disk_cache
|
||||
|
||||
|
||||
# Matches Llama-style byte-fallback tokens such as "<0x0A>" (uppercase hex).
re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$")
# Matches a run of U+FFFD replacement characters with up to 6 chars of
# context on either side, i.e. a decoded token that is purely an
# invalid-UTF-8 placeholder sequence. The original literal replacement
# character was mojibake-garbled in transit; \ufffd restores it explicitly.
re_replacement_seq = re.compile(r"^.{0,6}\ufffd+.{0,6}$")
|
||||
|
||||
|
||||
def _reduced_vocabulary(
    tokenizer: TokenizerLike,
    eos_token_id: int,
) -> dict[bytes, list[int]]:
    """Create a map from vocabulary tokens to lists of equivalent token ids.

    Special tokens are skipped, and the EOS token id is excluded from the
    mapping (it is handled separately by the outlines Vocabulary).

    Returns:
        A Dict of token string -> equivalent token ids
    """

    # Inverse of the GPT-2 byte<->unicode table: printable char -> raw byte.
    unicode_to_bytes = {v: k for k, v in tokenization_gpt2.bytes_to_unicode().items()}

    def convert_token_to_string(token: str) -> str:
        string = tokenizer.convert_tokens_to_string([token])

        # A hack to handle missing spaces to HF's Llama tokenizers
        # NOTE: precedence is (type(...) and startswith(...)) or token == "<0x20>",
        # i.e. the "<0x20>" check applies regardless of the token's type.
        if (
            type(token) is str
            and token.startswith(file_utils.SPIECE_UNDERLINE)
            or token == "<0x20>"
        ):
            return " " + string

        return string

    vocabulary: dict[bytes, list[int]] = {}
    # Ids whose decoded string is empty; accumulated but not returned here
    # (presumably kept for parity with upstream outlines — TODO confirm).
    empty_token_ids: list[int] = []
    for token, token_idx in tokenizer.get_vocab().items():
        if token in tokenizer.all_special_tokens:
            continue

        token_str = convert_token_to_string(token)
        if token_str:
            if isinstance(token, (bytes, bytearray)):
                # For BPE tokenizers where tokens are stored as bytes.

                # safe to ignore since token_str is of type (bytearray, bytes)
                # by this point.
                token_bytes = bytes(token_str)  # type: ignore[arg-type]

            elif "\ufffd" in token_str and not re_replacement_seq.match(token_str):
                # Handle tokens with invalid UTF-8 sequences.
                if re_llama_byte_token.match(token):
                    # Llama-like tokenizers use <0xXX> for incomplete sequences.
                    token_bytes = bytes([int(token[3:5], 16)])
                else:
                    # GPT2 tokenizers: map each byte back using unicode_to_bytes
                    byte_vals = [unicode_to_bytes.get(c) for c in token]
                    if None in byte_vals:
                        raise RuntimeError(
                            f"Cannot convert token `{token}`"
                            f" ({token_idx}) to bytes: {token_str}"
                        )
                    # safe to ignore, since if None in byte_vals,
                    # an error is thrown.
                    token_bytes = bytes(byte_vals)  # type: ignore[arg-type]
            else:
                # Common case: a valid UTF-8 string token.
                token_bytes = token_str.encode("utf-8")

            if token_idx != eos_token_id:
                # Several token ids can decode to the same byte string.
                vocabulary.setdefault(token_bytes, []).append(token_idx)
        else:
            empty_token_ids.append(token_idx)

    return vocabulary
|
||||
|
||||
|
||||
def get_outlines_vocabulary(tokenizer: TokenizerLike) -> OutlinesVocabulary:
    """Get the `OutlinesVocabulary` wrapper for a given tokenizer.

    The result is memoized on the tokenizer instance itself (attribute
    ``_outlines_vocabulary``) so the reduced vocabulary is built only once.

    Raises:
        ValueError: if the tokenizer has no usable ``eos_token_id`` or lacks
            the attributes needed to enumerate its vocabulary.
    """
    # Fast path: reuse the vocabulary cached on this tokenizer instance.
    if hasattr(tokenizer, "_outlines_vocabulary"):
        return tokenizer._outlines_vocabulary  # type: ignore

    try:
        if (
            hasattr(
                tokenizer,
                "eos_token_id",
            )
            and tokenizer.eos_token_id is not None
        ):
            eos_token_id = tokenizer.eos_token_id
        else:
            raise ValueError(
                f"Error during structured outputs setup for outlines: Tokenizer ({type(tokenizer)}) has no `eos_token_id` property, but `eos_token_id` is required for structured outputs to work properly."  # noqa: E501
            )

        reduced_vocab = _reduced_vocabulary(
            tokenizer,
            eos_token_id,  # type: ignore
        )
        vocabulary = OutlinesVocabulary(oc.Vocabulary(eos_token_id, reduced_vocab))
        # Memoize on the tokenizer so subsequent calls are O(1).
        tokenizer._outlines_vocabulary = vocabulary  # type: ignore

        return vocabulary
    except AttributeError as e:
        # Raised when the tokenizer lacks get_vocab/all_special_tokens etc.
        raise ValueError(
            f"Cannot get the vocabulary of the tokenizer "
            f"({type(tokenizer)}). The tokenizer should have a "
            "get_vocab method."
        ) from e
|
||||
|
||||
|
||||
def grammar_is_likely_lark(grammar_str: str) -> bool:
    """Heuristically decide whether a grammar string uses Lark syntax.

    Any non-comment line containing the EBNF definition operator ``::=``
    marks the grammar as EBNF; otherwise it is assumed to be Lark.

    Args:
        grammar_str: Input grammar string

    Returns:
        bool: True if grammar appears to be in Lark format, False otherwise

    Examples:
        >>> grammar_is_likely_lark("rule: 'abc'")
        True
        >>> grammar_is_likely_lark("rule ::= 'abc'")
        False
    """
    if not isinstance(grammar_str, str) or not grammar_str:
        return False

    # Strip both '#' and '//' comment styles, then ignore blank lines.
    meaningful = (
        re.sub(r"(#|//).*$", "", raw).strip() for raw in grammar_str.split("\n")
    )
    return not any("::=" in line for line in meaningful if line)
|
||||
|
||||
|
||||
def convert_lark_to_ebnf(grammar_str: str) -> str:
    """
    Convert a Lark grammar string to EBNF format.

    EBNF reference:
    https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
    Lark grammar reference:
    https://lark-parser.readthedocs.io/en/latest/grammar.html

    Args:
        grammar_str: Input grammar in Lark format

    Returns:
        str: Converted grammar in EBNF format

    Raises:
        ValueError: on non-string/empty input, mismatched quotes, an
            alternative without a preceding rule, or references to
            undefined rules.

    Examples:
        >>> print(convert_lark_to_ebnf("rule: 'hello'"))
        root ::= rule
        rule ::= "hello"
    """
    if not isinstance(grammar_str, str):
        raise ValueError(f"Grammar must be a string, got {type(grammar_str)}")
    if not grammar_str.strip():
        raise ValueError("Grammar string cannot be empty")

    defined_rules = set()
    referenced_rules = set()
    output_lines = []

    def clean_line(line: str) -> str:
        """Remove comments and whitespace from line."""
        return re.sub(r"(#|//).*$", "", line).strip()

    def check_quotes(text: str, rule_name: str, line_num: int) -> None:
        """Validate quote matching in text."""
        if text.count("'") % 2 != 0 or text.count('"') % 2 != 0:
            raise ValueError(f"Mismatched quotes in {rule_name} on line {line_num}")

    def extract_references(text: str) -> set[str]:
        """Extract rule references from text."""
        # Remove quoted strings and special characters
        text = re.sub(r'"[^"]*"', "", text)
        text = re.sub(r"[+*?()|\[\]{}]", " ", text)
        return set(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", text))

    # First pass: Find root rule and validate rule definitions
    lines = [clean_line(line) for line in grammar_str.split("\n")]
    first_rule = None

    for line_num, line in enumerate(lines, 1):
        if not line or line.startswith("|"):
            continue

        if ":" in line:
            try:
                # Lark allows a leading '?' (inlining marker); strip it.
                name = line.split(":", 1)[0].strip().strip("?")
                defined_rules.add(name)
                if first_rule is None:
                    first_rule = name
                # A rule literally named "start" wins as the root rule.
                if name == "start":
                    first_rule = "start"
            except IndexError as e:
                raise ValueError(
                    f"Invalid rule format on line {line_num}. "
                    "Expected 'rule_name: definition'"
                ) from e

    if not defined_rules:
        raise ValueError("No valid rules found in grammar")

    # Add root rule
    output_lines.append(f"root ::= {first_rule}")

    # Second pass: Process rule definitions and alternatives
    current_rule = None
    current_definition = []

    for line_num, line in enumerate(lines, 1):
        if not line:
            continue

        try:
            if ":" in line and not line.startswith("|"):
                # Save previous rule if exists
                if current_rule:
                    output_lines.append(
                        f"{current_rule} ::= {' | '.join(current_definition)}"
                    )

                # Process new rule
                name, definition = line.split(":", 1)
                current_rule = name.strip().strip("?")

                check_quotes(definition, f"rule '{current_rule}'", line_num)
                # EBNF uses double-quoted terminals; rewrite 'x' -> "x".
                definition = re.sub(r"'([^']*)'", r'"\1"', definition)
                referenced_rules.update(extract_references(definition))
                current_definition = [definition.strip()]

            elif line.startswith("|"):
                # Continuation line: another alternative for the current rule.
                if not current_rule:
                    raise ValueError(
                        f"Alternative '|' on line {line_num} "
                        "without a preceding rule definition"
                    )

                alt_def = line[1:].strip()
                check_quotes(
                    alt_def, f"alternative for rule '{current_rule}'", line_num
                )
                alt_def = re.sub(r"'([^']*)'", r'"\1"', alt_def)
                referenced_rules.update(extract_references(alt_def))
                current_definition.append(alt_def)

        except ValueError as e:
            # Re-raise with line context for easier debugging.
            raise ValueError(f"Error on line {line_num}: {str(e)}") from e

    # Add final rule if exists
    if current_rule:
        output_lines.append(f"{current_rule} ::= {' | '.join(current_definition)}")

    # Validate all rules are defined
    undefined_rules = referenced_rules - defined_rules - {"root"}
    if undefined_rules:
        raise ValueError(
            f"Referenced rules are not defined: {', '.join(sorted(undefined_rules))}"
        )

    return "\n".join(output_lines)
|
||||
|
||||
|
||||
def choice_as_grammar(choice: list[str]) -> str:
    """Build an EBNF grammar whose root matches exactly one of *choice*.

    Each option becomes a double-quoted terminal, with embedded double
    quotes and backslashes escaped so the literal stays valid EBNF.
    """

    def _escape(text: str) -> str:
        # Prefix every '"' and '\' with a backslash.
        return re.sub(r'(["\\])', r"\\\1", text)

    alternatives = " | ".join(f'"{_escape(option)}"' for option in choice)
    return "root ::= " + alternatives
|
||||
Reference in New Issue
Block a user