init
This commit is contained in:
181
model_executor/guided_decoding/__init__.py
Normal file
181
model_executor/guided_decoding/__init__.py
Normal file
@@ -0,0 +1,181 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.guided_decoding.utils import (
|
||||
convert_lark_to_gbnf, grammar_is_likely_lark,
|
||||
has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logits_process import LogitsProcessor
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def maybe_backend_fallback(
        guided_params: GuidedDecodingParams) -> GuidedDecodingParams:
    """Resolve the guided decoding backend, switching to a fallback if needed.

    The checks are applied in order and each may rewrite
    ``guided_params.backend`` in place:

    * ``auto`` is treated as ``xgrammar``.
    * ``lm-format-enforcer`` falls back to ``xgrammar`` for grammars and to
      ``outlines`` for JSON schemas it reports as unsupported.
    * ``xgrammar`` falls back to ``outlines`` for JSON schemas it reports as
      unsupported, for Lark grammars that fail to convert to GBNF, and
      whenever the xgrammar module could not be imported.
    * ``outlines`` falls back to ``guidance`` when ``json_object`` is set.

    Args:
        guided_params: The request's guided decoding parameters. Mutated in
            place when a fallback is selected.

    Returns:
        The same ``guided_params`` object, with ``backend`` possibly changed.

    Raises:
        ValueError: If a fallback is required but
            ``guided_params.disable_fallback`` is set.
    """

    def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
                          fallback: str) -> None:
        """Change the backend to the specified fallback with a warning log,
        or raise a ValueError if the `disable_fallback` option is specified."""
        if guided_params.disable_fallback:
            raise ValueError(message)

        logger.warning("%s Falling back to use %s instead.", message, fallback)
        guided_params.backend = fallback

    # `auto` was added for V1 to explicitly declare a mode that has fallbacks
    # in place. If that is specified with V0, treat it as `xgrammar`, as we have
    # fallbacks enabled for that and it is the V0 default.
    if guided_params.backend == "auto":
        guided_params.backend = "xgrammar"

    # lm-format-enforce doesn't support grammar, fallback to xgrammar
    if guided_params.backend == "lm-format-enforcer":
        if guided_params.grammar is not None:
            fallback_or_error(
                guided_params,
                "lm-format-enforcer does not support grammar guided decoding.",
                "xgrammar")

        # lm-format-enforcer doesn't support some JSON schema features
        elif (guided_params.json is not None
              and has_lmf_unsupported_json_features(guided_params.json)):
            fallback_or_error(
                guided_params,
                "lm-format-enforcer does not support advanced JSON schema "
                "features like patterns or numeric ranges.", "outlines")

    # Not an `elif`: the lm-format-enforcer branch above may have just switched
    # the backend to xgrammar, in which case these checks must run as well.
    if guided_params.backend == "xgrammar":
        from vllm.model_executor.guided_decoding.xgrammar_decoding import (
            xgr_installed)

        # xgrammar doesn't support some JSON schema features
        if (guided_params.json is not None and
                has_xgrammar_unsupported_json_features(guided_params.json)):
            fallback_or_error(
                guided_params,
                "xgrammar does not support advanced JSON schema features like "
                "string length, item limits, or property bounds.", "outlines")

        # xgrammar only supports GBNF grammars, so we must convert Lark.
        # We must check if the grammar is likely Lark and if that
        # grammar is convertible to GBNF
        elif (guided_params.grammar is not None
              and grammar_is_likely_lark(guided_params.grammar)):
            try:
                convert_lark_to_gbnf(guided_params.grammar)
            except Exception:
                fallback_or_error(
                    guided_params,
                    "xgrammar does not support Lark grammars and the "
                    "grammar failed to convert to GBNF.", "outlines")

        # If the xgrammar module cannot be imported successfully,
        # we should still allow users to use guided decoding with a fallback.
        # This is checked independently of the branches above: previously it
        # was part of the same `elif` chain, so a Lark grammar that converted
        # successfully skipped it and left an unusable backend selected.
        if guided_params.backend == "xgrammar" and not xgr_installed:
            fallback_or_error(
                guided_params,
                "xgrammar module cannot be imported successfully.", "outlines")

    if (guided_params.backend == "outlines"
            and guided_params.json_object is not None):
        # outlines doesn't support json_object, fallback to guidance
        fallback_or_error(guided_params,
                          "outlines does not support json_object.", "guidance")

    return guided_params
|
||||
|
||||
|
||||
async def get_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer: PreTrainedTokenizer,
        model_config: ModelConfig,
        reasoning_backend: str | None = None) -> LogitsProcessor | None:
    """Build the logits processor for the request's guided decoding backend.

    The backend is first resolved through `maybe_backend_fallback`, then the
    matching backend module is imported lazily and asked for a processor.
    Only the outlines path is awaited; the other backends are called
    synchronously.

    Args:
        guided_params: Guided decoding parameters of the request.
        tokenizer: Tokenizer handed to the backend (and to the reasoning
            parser, when one is requested).
        model_config: Model configuration; only passed to the xgrammar
            backend.
        reasoning_backend: Optional reasoning parser name. When set, a parser
            instance is created and passed to the outlines and xgrammar
            backends; the other backends do not receive it.

    Returns:
        The logits processor produced by the selected backend (which may be
        ``None`` if the backend finds no guide to apply).

    Raises:
        ValueError: If the resolved backend name is not recognised, or if a
            fallback was required but disabled.
    """
    reasoner = None
    if reasoning_backend:
        reasoner_class = ReasoningParserManager.get_reasoning_parser(
            reasoning_backend)
        reasoner = reasoner_class(tokenizer)

    guided_params = maybe_backend_fallback(guided_params)

    if guided_params.backend == 'outlines':
        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
            get_outlines_guided_decoding_logits_processor)
        return await get_outlines_guided_decoding_logits_processor(
            guided_params, tokenizer, reasoner)
    if guided_params.backend == 'lm-format-enforcer':
        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
            get_local_lm_format_enforcer_guided_decoding_logits_processor)
        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
            guided_params, tokenizer)
    if guided_params.backend == 'xgrammar':
        from vllm.model_executor.guided_decoding.xgrammar_decoding import (  # noqa
            get_local_xgrammar_guided_decoding_logits_processor)
        return get_local_xgrammar_guided_decoding_logits_processor(
            guided_params, tokenizer, model_config, reasoner)
    if guided_params.backend == 'guidance':
        from vllm.model_executor.guided_decoding.guidance_decoding import (
            get_local_guidance_guided_decoding_logits_processor)
        return get_local_guidance_guided_decoding_logits_processor(
            guided_params, tokenizer)
    # The original message had an unbalanced quote after `outlines`.
    raise ValueError(
        f"Unknown guided decoding backend '{guided_params.backend}'. "
        "Must be one of 'outlines', 'lm-format-enforcer', 'xgrammar', "
        "'guidance'")
|
||||
|
||||
|
||||
def get_local_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer: PreTrainedTokenizer,
        model_config: ModelConfig,
        reasoning_backend: str | None = None) -> LogitsProcessor | None:
    """Synchronously build the logits processor for the resolved backend.

    The backend is first resolved through `maybe_backend_fallback`, then the
    matching backend module is imported lazily and asked for a processor.

    Args:
        guided_params: Guided decoding parameters of the request.
        tokenizer: Tokenizer handed to the backend (and to the reasoning
            parser, when one is requested).
        model_config: Model configuration; only passed to the xgrammar
            backend.
        reasoning_backend: Optional reasoning parser name. When set, a parser
            instance is created and passed to the outlines and xgrammar
            backends; the other backends do not receive it.

    Returns:
        The logits processor produced by the selected backend (which may be
        ``None`` if the backend finds no guide to apply).

    Raises:
        ValueError: If the resolved backend name is not recognised, or if a
            fallback was required but disabled.
    """
    guided_params = maybe_backend_fallback(guided_params)

    reasoner = None
    if reasoning_backend:
        reasoner_class = ReasoningParserManager.get_reasoning_parser(
            reasoning_backend)
        reasoner = reasoner_class(tokenizer)

    if guided_params.backend == 'outlines':
        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
            get_local_outlines_guided_decoding_logits_processor)
        return get_local_outlines_guided_decoding_logits_processor(
            guided_params, tokenizer, reasoner)
    if guided_params.backend == 'lm-format-enforcer':
        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
            get_local_lm_format_enforcer_guided_decoding_logits_processor)
        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
            guided_params, tokenizer)
    if guided_params.backend == 'xgrammar':
        from vllm.model_executor.guided_decoding.xgrammar_decoding import (  # noqa
            get_local_xgrammar_guided_decoding_logits_processor)
        return get_local_xgrammar_guided_decoding_logits_processor(
            guided_params, tokenizer, model_config, reasoner)
    if guided_params.backend == 'guidance':
        from vllm.model_executor.guided_decoding.guidance_decoding import (
            get_local_guidance_guided_decoding_logits_processor)
        return get_local_guidance_guided_decoding_logits_processor(
            guided_params, tokenizer)

    # The original message had an unbalanced quote after `outlines`.
    raise ValueError(
        f"Unknown guided decoding backend '{guided_params.backend}'. "
        "Must be one of 'outlines', 'lm-format-enforcer', 'xgrammar', "
        "'guidance'")
|
||||
63
model_executor/guided_decoding/guidance_decoding.py
Normal file
63
model_executor/guided_decoding/guidance_decoding.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
|
||||
import llguidance
|
||||
from regex import escape as regex_escape
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.model_executor.guided_decoding.guidance_logits_processors import (
|
||||
GuidanceLogitsProcessor)
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.v1.structured_output.backend_guidance import (
|
||||
process_for_additional_properties)
|
||||
|
||||
|
||||
def get_local_guidance_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer: PreTrainedTokenizerBase) -> GuidanceLogitsProcessor:
    """Translate guided decoding parameters into a guidance logits processor.

    Exactly one guide is honoured, checked in this order: JSON schema,
    free-form JSON object, regex, choice list, grammar. The guide is compiled
    to an llguidance grammar and wrapped in a `GuidanceLogitsProcessor`.

    Args:
        guided_params: Guided decoding parameters of the request.
        tokenizer: Tokenizer passed on to the logits processor.

    Returns:
        A `GuidanceLogitsProcessor` for the compiled grammar.

    Raises:
        ValueError: If no guide is set, or the compiled grammar is empty.
    """
    flexible_whitespace = not guided_params.disable_any_whitespace
    compiled = ""

    schema = guided_params.json
    if schema is not None:
        # Optionally set additionalProperties to False at the top-level.
        # Other backends reject additional top-level properties by default,
        # so this brings guidance closer to their behaviour.
        if guided_params.disable_additional_properties:
            schema_text = (schema
                           if isinstance(schema, str) else json.dumps(schema))
            schema = process_for_additional_properties(schema_text)

        compiled = llguidance.LLMatcher.grammar_from_json_schema(
            schema,
            overrides={"whitespace_pattern": guided_params.whitespace_pattern},
            defaults={"whitespace_flexible": flexible_whitespace})
    elif guided_params.json_object:
        compiled = llguidance.LLMatcher.grammar_from_json_schema(
            '{"type": "object"}',
            overrides={"whitespace_pattern": guided_params.whitespace_pattern},
            defaults={"whitespace_flexible": flexible_whitespace})
    elif guided_params.regex:
        compiled = llguidance.grammar_from("regex", guided_params.regex)
    elif guided_params.choice:
        # A choice list is expressed as a regex alternation of the escaped
        # options.
        alternation = "|".join(
            [regex_escape(str(option)) for option in guided_params.choice])
        compiled = llguidance.grammar_from("regex", f"({alternation})")
    elif guided_params.grammar:
        # this supports Lark and GBNF
        compiled = llguidance.grammar_from("grammar", guided_params.grammar)

    if not compiled:
        raise ValueError("Unknown guided decoding mode")

    return GuidanceLogitsProcessor(compiled, tokenizer)
|
||||
104
model_executor/guided_decoding/guidance_logits_processors.py
Normal file
104
model_executor/guided_decoding/guidance_logits_processors.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import llguidance
|
||||
import llguidance.hf
|
||||
import llguidance.torch
|
||||
import torch
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class GuidanceLogitsProcessor:
    """Base Guidance Logits Processor.

    Wraps an llguidance matcher and applies its next-token bitmask to the
    logits on every call. The matcher, tokenizer wrapper and bitmask are
    created lazily on first use (see `_initialize`).
    """

    # Class-level cache of llguidance tokenizers, keyed by the HF tokenizer's
    # `name_or_path`, shared by every processor instance.
    cached_tokenizers: dict[str, Any] = {}

    def __init__(
        self,
        grammar: str,
        tokenizer: PreTrainedTokenizerBase,
    ) -> None:
        """Base Guidance Logits Processor

        Args:
            grammar (str)
                grammar to guide the generation
            tokenizer (PreTrainedTokenizerBase)
                model's tokenizer
        """
        self.grammar = grammar
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer.name_or_path
        self.ll_tokenizer = None
        self.ll_matcher = None
        self.bitmask = None
        # Becomes True after the first `__call__`; from then on the last
        # token of `input_ids` is fed to the matcher before masking.
        self.new_sampling = False
        self.initialized = False

    def clone(self) -> "GuidanceLogitsProcessor":
        """Return a shallow copy with its own matcher and bitmask.

        When this processor is already initialized, the copy gets a freshly
        constructed matcher and a newly allocated bitmask. The source
        processor is left untouched (the previous code assigned the new
        bitmask to `self`, mutating the source and handing the old buffer to
        the clone).
        """
        cloned = copy.copy(self)
        if self.initialized:
            cloned.ll_matcher = llguidance.LLMatcher(
                self.ll_tokenizer,  # type: ignore[assignment]
                self.grammar,
                log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
            )
            cloned.bitmask = llguidance.torch.allocate_token_bitmask(
                1, self.ll_tokenizer.vocab_size)  # type: ignore[attr-defined]
        return cloned

    def _initialize(self):
        """Create the llguidance tokenizer, matcher and bitmask once."""
        if self.initialized:
            return

        ll_tokenizer = self.cached_tokenizers.get(self.tokenizer.name_or_path,
                                                  None)
        if ll_tokenizer is None:
            ll_tokenizer = llguidance.hf.from_tokenizer(self.tokenizer, None)
            self.cached_tokenizers[self.tokenizer.name_or_path] = ll_tokenizer

        self.ll_tokenizer = ll_tokenizer
        self.ll_matcher = llguidance.LLMatcher(
            self.ll_tokenizer,
            self.grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )

        # create reusable bitmask
        self.bitmask = llguidance.torch.allocate_token_bitmask(
            1, self.ll_tokenizer.vocab_size)  # type: ignore[attr-defined]

        self.initialized = True

    def __call__(
        self,
        input_ids: list[int],
        scores: torch.Tensor,
    ) -> torch.Tensor:
        """Mask `scores` in place with the matcher's allowed-token bitmask.

        Args:
            input_ids: Token ids generated so far; only the last one is
                consumed, and only on calls after the first.
            scores: Logits for the next token; modified in place.

        Returns:
            The same `scores` tensor after masking.
        """
        # we initialize the guidance model here
        # to avoid pickling ll_tokenizer and ll_interpreter
        self._initialize()

        if self.new_sampling and len(input_ids) > 0:
            self.ll_matcher.consume_token(  # type: ignore[attr-defined]
                input_ids[-1])
            err = self.ll_matcher.get_error()  # type: ignore[attr-defined]
            if err:
                # Matcher errors are logged, not raised; masking continues.
                logger.warning("Error in LLMatcher: %s", err)

        llguidance.torch.fill_next_token_bitmask(self.ll_matcher, self.bitmask,
                                                 0)
        llguidance.torch.apply_token_bitmask_inplace(
            scores,
            self.bitmask.to(scores.device))  # type: ignore[attr-defined]

        self.new_sampling = True

        return scores
|
||||
41
model_executor/guided_decoding/guided_fields.py
Normal file
41
model_executor/guided_decoding/guided_fields.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, TypedDict, Union
|
||||
|
||||
|
||||
# These classes are deprecated, see SamplingParams
|
||||
class LLMGuidedOptions(TypedDict, total=False):
    """Dictionary form of the guided decoding options.

    Declared with ``total=False``, so every key is optional.
    """
    # JSON schema guide, as a dict or a string.
    guided_json: Union[dict, str]
    # Regular expression guide.
    guided_regex: str
    # List of allowed output strings.
    guided_choice: list[str]
    # Grammar guide.
    guided_grammar: str
    # Name of the guided decoding backend to use.
    guided_decoding_backend: str
    # Whitespace pattern option.
    guided_whitespace_pattern: str
    # Flag requesting JSON-object output.
    guided_json_object: bool
|
||||
|
||||
|
||||
@dataclass
class GuidedDecodingRequest:
    """Guided decoding options; at most one guide field may be set.

    The guide fields are `guided_json`, `guided_regex`, `guided_choice`,
    `guided_grammar`, `guided_json_object` and `structural_tag`.
    `guided_decoding_backend` and `guided_whitespace_pattern` are modifiers
    and are not part of the exclusivity check.
    """
    guided_json: Optional[Union[dict, str]] = None
    guided_regex: Optional[str] = None
    guided_choice: Optional[list[str]] = None
    guided_grammar: Optional[str] = None
    guided_decoding_backend: Optional[str] = None
    guided_whitespace_pattern: Optional[str] = None
    guided_json_object: Optional[bool] = None
    structural_tag: Optional[str] = None

    def __post_init__(self):
        """Reject requests that specify more than one guide field."""
        candidate_guides = (
            self.guided_json,
            self.guided_regex,
            self.guided_choice,
            self.guided_grammar,
            self.guided_json_object,
            self.structural_tag,
        )
        # A guide counts as specified when it is anything other than None
        # (so `False`, `""` and `[]` count too).
        specified = [guide for guide in candidate_guides if guide is not None]
        if len(specified) > 1:
            raise ValueError(
                "You can only use one kind of guided decoding but multiple are "
                f"specified: {self.__dict__}")
|
||||
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from functools import lru_cache
|
||||
from json import loads as json_loads
|
||||
from typing import Optional, Union
|
||||
|
||||
from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser,
|
||||
RegexParser, StringParser,
|
||||
TokenEnforcerTokenizerData, UnionParser)
|
||||
from lmformatenforcer.integrations.vllm import (
|
||||
build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.logits_process import LogitsProcessor
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
def get_local_lm_format_enforcer_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer) -> Optional[LogitsProcessor]:
    """Build an lm-format-enforcer logits processor for the given guide.

    The tokenizer data is obtained through a cached helper; the
    character-level parser and the logits processor are built on every call.
    Guides are checked in this order: JSON schema, choice list, regex,
    grammar (rejected), free-form JSON object.

    Args:
        guided_params: Guided decoding parameters of the request.
        tokenizer: Tokenizer used to build the enforcer tokenizer data.

    Returns:
        The logits processor, or ``None`` when no guide is set.

    Raises:
        ValueError: If a grammar guide is requested, which this backend
            cannot handle.
    """
    # Built first, before any guide is inspected, as in the original flow.
    enforcer_tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
        tokenizer)

    parser: CharacterLevelParser
    if guided_params.json:
        parser = JsonSchemaParser(
            _normalize_json_schema_object(guided_params.json))
    elif guided_params.choice:
        alternatives = [
            StringParser(option) for option in guided_params.choice
        ]
        parser = UnionParser(alternatives)
    elif guided_params.regex:
        parser = RegexParser(guided_params.regex)
    elif guided_params.grammar:
        # CFG grammar not supported by LMFE
        raise ValueError("Cannot construct a guided decoding logits processor"
                         " using the grammar option with the"
                         " lm_format_enforcer backend.")
    elif guided_params.json_object:
        # None means any json object
        parser = JsonSchemaParser(None)
    else:
        return None

    return build_vllm_logits_processor(enforcer_tokenizer_data, parser)
|
||||
|
||||
|
||||
def _normalize_json_schema_object(schema: Union[str, dict]) -> dict:
|
||||
if isinstance(schema, str):
|
||||
return json_loads(schema)
|
||||
if isinstance(schema, dict):
|
||||
return schema
|
||||
raise AssertionError(f"Unsupported schema type {schema}")
|
||||
|
||||
|
||||
@lru_cache
def _cached_build_vllm_token_enforcer_tokenizer_data(
        tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData:
    """Build lm-format-enforcer tokenizer data, memoized per tokenizer.

    Results are cached by `functools.lru_cache` with its default settings,
    keyed on the `tokenizer` argument, so the tokenizer must be hashable.
    """
    return build_vllm_token_enforcer_tokenizer_data(tokenizer)
|
||||
155
model_executor/guided_decoding/outlines_decoding.py
Normal file
155
model_executor/guided_decoding/outlines_decoding.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import os
|
||||
from enum import Enum
|
||||
from json import dumps as json_dumps
|
||||
from typing import Optional, Union
|
||||
|
||||
from regex import escape as regex_escape
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.model_executor.guided_decoding.outlines_logits_processors import (
|
||||
CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
class GuidedDecodingMode(Enum):
    """Kind of guide used to select the outlines logits processor."""
    JSON = "json"
    REGEX = "regex"
    CHOICE = "choice"
    GRAMMAR = "grammar"
|
||||
|
||||
|
||||
# https://github.com/outlines-dev/outlines/blob/main/outlines/grammars/json.lark
# The main difference is that the start rule was changed from `start: value`
# to `start: object | array`, so scalar values are rejected as the root of
# the JSON document. Allowing a scalar at the root seems to cause llama to
# generate without stopping.
|
||||
JSON_GRAMMAR = r"""
|
||||
?start: object | array
|
||||
|
||||
?value: object
|
||||
| array
|
||||
| UNESCAPED_STRING
|
||||
| SIGNED_NUMBER -> number
|
||||
| "true" -> true
|
||||
| "false" -> false
|
||||
| "null" -> null
|
||||
|
||||
array : "[" [value ("," value)*] "]"
|
||||
object : "{" [pair ("," pair)*] "}"
|
||||
pair : UNESCAPED_STRING ":" value
|
||||
|
||||
%import common.UNESCAPED_STRING
|
||||
%import common.SIGNED_NUMBER
|
||||
%import common.WS
|
||||
|
||||
%ignore WS
|
||||
"""
|
||||
|
||||
# Module-wide executor, created lazily on first use by
# `get_outlines_guided_decoding_logits_processor` and reused afterwards.
global_thread_pool = None  # used for generating logits processor fsm

# Upper bound on the executor's worker count.
# It's not yet clear that using more provides a benefit, and it could
# potentially starve other processes on the machine. We'll cap this for now and
# adjust later if testing proves it to help overcome a bottleneck.
_MAX_THREADPOOL_WORKERS = 16
|
||||
|
||||
|
||||
async def get_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams,
    tokenizer: PreTrainedTokenizerBase,
    reasoner: Optional[ReasoningParser],
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
           None]:
    """Build the outlines logits processor off the event loop.

    The guide and mode are derived from `guided_params`; construction of the
    processor is then run in the shared module-level thread pool, which is
    created on first use.

    Returns:
        The logits processor for the guide, or ``None`` when no guide is set.
    """
    global global_thread_pool

    guide, mode = _get_guide_and_mode(guided_params)
    if not (guide and mode):
        return None

    if global_thread_pool is None:
        # Use one worker per CPU (2 if the count is unknown), capped.
        worker_count = os.cpu_count() or 2
        worker_count = min(worker_count, _MAX_THREADPOOL_WORKERS)
        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=worker_count)

    running_loop = asyncio.get_running_loop()
    pending = running_loop.run_in_executor(global_thread_pool,
                                           _get_logits_processor, guide,
                                           tokenizer, mode,
                                           guided_params.whitespace_pattern,
                                           reasoner)
    return await pending
|
||||
|
||||
|
||||
def get_local_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams,
    tokenizer: PreTrainedTokenizerBase,
    reasoner: Optional[ReasoningParser],
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
           None]:
    """Synchronously build the outlines logits processor for the guide.

    Returns:
        The logits processor for the guide, or ``None`` when no guide is set.
    """
    guide, mode = _get_guide_and_mode(guided_params)
    if guide and mode:
        return _get_logits_processor(guide, tokenizer, mode,
                                     guided_params.whitespace_pattern,
                                     reasoner)
    return None
|
||||
|
||||
|
||||
def _get_guide_and_mode(
|
||||
guided_params: GuidedDecodingParams
|
||||
) -> Union[tuple[str, GuidedDecodingMode], tuple[None, None]]:
|
||||
if guided_params.json:
|
||||
if isinstance(guided_params.json, dict):
|
||||
# turn dict into hashable string
|
||||
json = json_dumps(guided_params.json)
|
||||
else:
|
||||
json = guided_params.json
|
||||
return json, GuidedDecodingMode.JSON
|
||||
elif guided_params.regex:
|
||||
return guided_params.regex, GuidedDecodingMode.REGEX
|
||||
elif guided_params.choice:
|
||||
# choice just uses regex
|
||||
choices = [
|
||||
regex_escape(str(choice)) for choice in guided_params.choice
|
||||
]
|
||||
choices_regex = "(" + "|".join(choices) + ")"
|
||||
return choices_regex, GuidedDecodingMode.CHOICE
|
||||
elif guided_params.grammar:
|
||||
return guided_params.grammar, GuidedDecodingMode.GRAMMAR
|
||||
elif guided_params.json_object:
|
||||
return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
def _get_logits_processor(
    guide: str,
    tokenizer: PreTrainedTokenizerBase,
    mode: GuidedDecodingMode,
    whitespace_pattern: Union[str, None],
    reasoner: Optional[ReasoningParser],
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
    """Instantiate the logits processor class that matches `mode`.

    JSON uses `JSONLogitsProcessor` (the only one receiving
    `whitespace_pattern`), REGEX and CHOICE use `RegexLogitsProcessor`, and
    GRAMMAR uses `CFGLogitsProcessor`.

    Raises:
        ValueError: If `mode` is none of the known modes.
    """
    if mode == GuidedDecodingMode.JSON:
        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern,
                                   reasoner)

    if mode in (GuidedDecodingMode.REGEX, GuidedDecodingMode.CHOICE):
        return RegexLogitsProcessor(guide, tokenizer, reasoner)

    if mode == GuidedDecodingMode.GRAMMAR:
        return CFGLogitsProcessor(guide, tokenizer, reasoner)

    raise ValueError(f"Unknown guided decoding mode {mode}")
|
||||
284
model_executor/guided_decoding/outlines_logits_processors.py
Normal file
284
model_executor/guided_decoding/outlines_logits_processors.py
Normal file
@@ -0,0 +1,284 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024- the Outlines developers
|
||||
# This file is adapted from
|
||||
# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from functools import lru_cache
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from outlines import grammars
|
||||
from outlines.caching import cache, disable_cache
|
||||
from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide,
|
||||
RegexGuide, Write)
|
||||
from outlines.fsm.parsing import PartialLark
|
||||
from outlines_core.fsm.json_schema import build_regex_from_schema
|
||||
from pydantic import BaseModel
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Import-time side effect: unless VLLM_V0_USE_OUTLINES_CACHE is set, the
# outlines cache is turned off via `disable_cache()`. When it is set, the
# cache stays on and a warning about its risks is logged.
if envs.VLLM_V0_USE_OUTLINES_CACHE:
    logger.warning("Enabling outlines cache. This is an unbounded on-disk "
                   "cache. It may consume a lot of disk space and should "
                   "not be used with untrusted clients.")
else:
    disable_cache()
|
||||
|
||||
|
||||
class BaseLogitsProcessor:
    """Logits processor that masks tokens according to an outlines `Guide`.

    FSM state is tracked per token sequence in `_fsm_state`, keyed by the
    hash of the tuple of input ids seen so far.
    """

    def __init__(self, guide: Guide, reasoner: Optional[ReasoningParser]):
        """Store the guide and optional reasoning parser.

        Args:
            guide: The outlines guide that drives token masking.
            reasoner: Optional reasoning parser; when given, masking is
                skipped until it reports that reasoning has ended.
        """
        self._guide: Guide = guide
        self._reasoner: Optional[ReasoningParser] = reasoner
        # CFGState is used for the FSM state for CFGGuide
        # Unknown sequence hashes default to state 0 (`defaultdict(int)`).
        self._fsm_state: defaultdict[int, Union[int,
                                                CFGState]] = defaultdict(int)

    def clone(self) -> "BaseLogitsProcessor":
        """Return a copy with its own guide copy and deep-copied FSM state."""
        cloned = copy.copy(self)
        cloned._guide = self._guide.copy()
        cloned._fsm_state = copy.deepcopy(self._fsm_state)
        return cloned

    def __call__(self, input_ids: list[int],
                 scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token."""

        # Skip the structured logits processing if reasoning is not finished.
        # reasoner is not None only when `--reasoning-parser` is set.
        if self._reasoner is not None:
            if not self._reasoner.is_reasoning_end(input_ids):
                return scores
            else:
                # Remove the reasoning tokens from the input_ids
                # We need this because our implementation relies on the
                # hash of the input_ids to store the FSM state.
                input_ids = self._reasoner.extract_content_ids(input_ids)

        seq_id = hash(tuple(input_ids))

        if len(input_ids) > 0:
            # Advance from the state stored for the sequence without its
            # last token, and store the result under the full sequence.
            last_token = input_ids[-1]
            last_seq_id = hash(tuple(input_ids[:-1]))
            self._fsm_state[seq_id] = self._guide.get_next_state(
                state=self._fsm_state[last_seq_id], token_id=last_token)
        else:
            # Note: this is a hack.
            # Lark pickling does not work properly (silent failure),
            # which breaks the RPC (which uses python pickling).
            # We need to find a better solution.
            # On the first time this is called, we simply re-create
            # the Lark object.
            if isinstance(self._guide, CFGGuide):
                self._guide.parser = PartialLark(
                    self._guide.cfg_string,
                    parser="lalr",
                    import_paths=[grammars.GRAMMAR_PATH],
                )
                self._fsm_state[seq_id] = CFGState(
                    parser_state=self._guide.parser.parse(""), prev_token=None)

        instruction = self._guide.get_next_instruction(
            state=self._fsm_state[seq_id])

        if type(instruction) == Generate:  # noqa: E721
            allowed_tokens = instruction.tokens
        elif type(instruction) == Write:  # noqa: E721
            # TODO: support fast forward tokens
            # Only the first token of the Write instruction is allowed.
            allowed_tokens = [instruction.tokens[0]]
        else:
            raise TypeError(
                f"Unsupported instruction type {type(instruction)}")

        # Start with every token disallowed (-inf); allowed ids are reset to
        # 0 below so adding the mask leaves their scores unchanged.
        mask = torch.full((scores.shape[-1], ),
                          -torch.inf,
                          device=scores.device)
        # The tokenizer may support more token ids than the model can generate,
        # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
        # but scores.shape == torch.Size([128256])
        # Using NumPy is faster for filtering token ids
        allowed_tokens = np.array(allowed_tokens, dtype=np.int64)
        allowed_tokens = torch.tensor(allowed_tokens, device=scores.device)
        # Drop ids that fall outside the score vector.
        allowed_tokens = allowed_tokens.masked_select(
            allowed_tokens < scores.shape[-1])
        mask.index_fill_(0, allowed_tokens, 0)
        if current_platform.is_hpu():
            # Workaround for HPU bug where add_() raise RuntimeError:
            # synNodeCreateWithId failed for node: strided_insert
            # with synStatus 1 [Invalid argument], hopefully it will
            # be fixed in the future releases of the HPU runtime.
            scores = scores.add(mask)
        else:
            scores.add_(mask)
        return scores
|
||||
|
||||
|
||||
class RegexLogitsProcessor(BaseLogitsProcessor):
    """Logits processor whose guide is built from a regular expression."""

    @classmethod
    @cache()
    def _get_guide(cls, regex_string: str,
                   tokenizer: PreTrainedTokenizerBase) -> Guide:
        """Build a `RegexGuide` for `regex_string`.

        Wrapped in the outlines `cache()` decorator. The tokenizer is first
        passed through `_adapt_tokenizer` (defined elsewhere in this module;
        presumably adapts it to the interface outlines expects - TODO
        confirm).
        """
        tokenizer = _adapt_tokenizer(tokenizer)
        return RegexGuide.from_regex(regex_string, tokenizer)

    def __init__(
        self,
        regex_string: str,
        tokenizer: PreTrainedTokenizerBase,
        reasoner: Optional[ReasoningParser],
    ):
        """Compile the FSM that drives the regex-structured generation.

        Parameters
        ----------
        regex_string
            A string that represents a regular expression
        tokenizer
            The model's tokenizer
        reasoner
            Optional reasoning parser, forwarded to `BaseLogitsProcessor`

        """
        super().__init__(
            RegexLogitsProcessor._get_guide(regex_string, tokenizer), reasoner)
|
||||
|
||||
|
||||
class JSONLogitsProcessor(RegexLogitsProcessor):
    """Regex-based logits processor whose regex is derived from a JSON schema."""

    def __init__(self, schema: Union[str, dict, BaseModel],
                 tokenizer: PreTrainedTokenizerBase,
                 whitespace_pattern: Union[str, None],
                 reasoner: Optional[ReasoningParser]):
        r"""Compile the FSM that drives the JSON-guided generation.

        Parameters
        ----------
        schema
            A JSON schema that encodes the structure we want the model to
            generate. Accepted forms: a Pydantic model class (its
            `model_json_schema()` is serialized), a dictionary (serialized
            with `json.dumps`), or a string (used as-is).
        tokenizer
            The model's tokenizer
        whitespace_pattern
            Pattern to use for JSON syntactic whitespace (doesn't impact
            string literals)
            Example: allow only a single space or newline with
            `whitespace_pattern=r"[\n ]?"`
        reasoner
            Optional reasoning parser, forwarded unchanged to the parent class

        Raises
        ------
        ValueError
            If `schema` is none of the accepted forms.
        """
        if isinstance(schema, str):
            schema_str = schema
        else:
            # Everything that is not already a string is normalised to a
            # plain dict first and serialized in one place.
            if isinstance(schema, type(BaseModel)):
                schema_dict = schema.model_json_schema()
            elif isinstance(schema, dict):
                schema_dict = schema
            else:
                raise ValueError(
                    f"Cannot parse schema {schema}. The schema must be either "
                    "a Pydantic object, a dictionary or a string that contains "
                    "the JSON Schema specification")
            schema_str = json.dumps(schema_dict)

        pattern = build_regex_from_schema(schema_str, whitespace_pattern)
        super().__init__(pattern, tokenizer, reasoner)
|
||||
|
||||
|
||||
class CFGLogitsProcessor(BaseLogitsProcessor):
    """Logits processor whose guide is built from a context-free grammar."""

    @classmethod
    @cache()
    def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
        """Build the CFG guide for `cfg` using the adapted tokenizer.

        The result goes through the `cache()` decorator applied above.
        """
        adapted = _adapt_tokenizer(tokenizer)
        return CFGGuide(cfg, adapted)

    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase,
                 reasoner: Optional[ReasoningParser]):
        """Compile the FSM that drives the context free grammar generation.

        Parameters
        ----------
        cfg
            A string that represents a context-free grammar
        tokenizer
            The model's tokenizer
        reasoner
            Optional reasoning parser, forwarded unchanged to the base class

        """
        shared_guide = CFGLogitsProcessor._get_guide(cfg, tokenizer)
        super().__init__(shared_guide, reasoner)
        # Work on a private copy so this instance does not operate on the
        # guide object returned by the (cached) factory.
        self._guide = self._guide.copy()

    def clone(self) -> "CFGLogitsProcessor":
        """Return a shallow copy with its own FSM state and its own guide copy."""
        duplicate = copy.copy(self)
        duplicate._fsm_state = copy.deepcopy(self._fsm_state)
        duplicate._guide = self._guide.copy()
        return duplicate
|
||||
|
||||
|
||||
@lru_cache(maxsize=32)
|
||||
def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
|
||||
"""Adapt vLLM's tokenizer to use to compile the FSM.
|
||||
|
||||
The API of Outlines tokenizers is slightly different to that of
|
||||
`transformers`. The decoder of outlines, returns a list whereas
|
||||
the decode of vLLM returns an str. To sync the vLLM decoder with
|
||||
outlines internal api, the decoder should be adapted. In addition
|
||||
we need to handle the missing spaces to Llama's tokenizer to be
|
||||
able to compile FSMs for this model.
|
||||
|
||||
"""
|
||||
if getattr(tokenizer, "_outlines_adapted", False):
|
||||
return tokenizer
|
||||
|
||||
tokenizer = copy.deepcopy(tokenizer)
|
||||
|
||||
tokenizer.vocabulary = tokenizer.get_vocab()
|
||||
tokenizer.special_tokens = set(tokenizer.all_special_tokens)
|
||||
|
||||
def convert_token_to_string(token: str) -> str:
|
||||
from transformers.file_utils import SPIECE_UNDERLINE
|
||||
|
||||
string = tokenizer.convert_tokens_to_string([token])
|
||||
|
||||
# A hack to handle missing spaces to HF's Llama tokenizers
|
||||
if (type(token) is str and token.startswith(SPIECE_UNDERLINE)
|
||||
or token == "<0x20>"):
|
||||
return " " + string
|
||||
|
||||
return string
|
||||
|
||||
def change_decoder(
|
||||
decoder: Callable[[list[int]],
|
||||
str]) -> Callable[[list[int]], list[str]]:
|
||||
"""Sync vLLM's decoder with the outlines by returning list."""
|
||||
|
||||
def new_decoder(inp_tokens: list[int]) -> list[str]:
|
||||
if (isinstance(inp_tokens, list) and len(inp_tokens) == 1
|
||||
and isinstance(inp_tokens[0], list)):
|
||||
inp_tokens = inp_tokens[0]
|
||||
return [decoder(inp_tokens)]
|
||||
|
||||
return new_decoder
|
||||
|
||||
tokenizer.convert_token_to_string = convert_token_to_string
|
||||
tokenizer.decode = change_decoder(tokenizer.decode)
|
||||
setattr(tokenizer, "_outlines_adapted", True) # noqa: B010
|
||||
|
||||
return tokenizer
|
||||
242
model_executor/guided_decoding/utils.py
Normal file
242
model_executor/guided_decoding/utils.py
Normal file
@@ -0,0 +1,242 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def has_xgrammar_unsupported_json_features(schema: dict) -> bool:
    """Check if JSON schema contains features unsupported by xgrammar.

    The schema is walked recursively (nested dicts and dicts inside lists).
    A node is flagged when its `"type"` is one of the types below and it
    carries one of the listed keywords:

    - integer / number: `multipleOf`
    - array: `uniqueItems`, `contains`, `minContains`, `maxContains`,
      `minItems`, `maxItems`
    - string: `minLength`, `maxLength`, `format`
    - object: `minProperties`, `maxProperties`, `propertyNames`,
      `patternProperties`

    Args:
        schema: JSON schema as a dictionary

    Returns:
        bool: True if any node uses a flagged keyword, False otherwise
    """
    numeric_types = ("integer", "number")
    # (type name, keywords flagged for that type)
    flagged_keywords = (
        ("array", ("uniqueItems", "contains", "minContains", "maxContains",
                   "minItems", "maxItems")),
        ("string", ("minLength", "maxLength", "format")),
        ("object", ("minProperties", "maxProperties", "propertyNames",
                    "patternProperties")),
    )

    def check_object(obj: dict) -> bool:
        if not isinstance(obj, dict):
            return False

        declared_type = obj.get("type")

        if declared_type in numeric_types and "multipleOf" in obj:
            return True

        # Compared with `==` (not used as a dict key) because "type" may be
        # an unhashable value such as a list of type names.
        for type_name, keywords in flagged_keywords:
            if declared_type == type_name and any(
                    keyword in obj for keyword in keywords):
                return True

        # Recurse into nested dicts and into dicts contained in lists.
        for child in obj.values():
            candidates = child if isinstance(child, list) else [child]
            if any(
                    isinstance(candidate, dict) and check_object(candidate)
                    for candidate in candidates):
                return True

        return False

    return check_object(schema)
|
||||
|
||||
|
||||
def has_lmf_unsupported_json_features(schema: dict) -> bool:
    """
    Check if JSON schema contains features unsupported
    by lm_format_enforcer.

    Any dictionary in the schema tree (including dictionaries nested inside
    lists) that has a "pattern" key is flagged.

    Known issues:
    - Regex patterns:
        "grade": {
            "type": "string",
            "pattern": "^[A-D]$"  # Regex pattern
        },

    Args:
        schema: JSON schema as a dictionary

    Returns:
        bool: True if a "pattern" key is found anywhere, False otherwise
    """

    def check_object(obj: dict) -> bool:
        if not isinstance(obj, dict):
            return False

        # Check for pattern restrictions
        if "pattern" in obj:
            return True

        # Recurse into nested dicts and into dicts contained in lists.
        for child in obj.values():
            candidates = child if isinstance(child, list) else [child]
            if any(
                    isinstance(candidate, dict) and check_object(candidate)
                    for candidate in candidates):
                return True

        return False

    return check_object(schema)
|
||||
|
||||
|
||||
def grammar_is_likely_lark(grammar_str: str) -> bool:
    """
    Check if grammar appears to use Lark syntax.

    The heuristic is negative: after stripping `#` and `//` comments, a
    grammar is considered Lark unless some line contains the GBNF rule
    operator `::=`.

    Args:
        grammar_str: Input grammar string

    Returns:
        bool: True if grammar appears to be in Lark format, False otherwise
            (also False for empty or non-string input)

    Examples:
        >>> grammar_is_likely_lark("rule: 'abc'")
        True
        >>> grammar_is_likely_lark("rule ::= 'abc'")
        False
    """
    if not grammar_str or not isinstance(grammar_str, str):
        return False

    # Remove both comment styles before looking for a GBNF rule definition.
    uncommented = (re.sub(r'(#|//).*$', '', raw_line)
                   for raw_line in grammar_str.split('\n'))
    return all('::=' not in text for text in uncommented)
|
||||
|
||||
|
||||
def convert_lark_to_gbnf(grammar_str: str) -> str:
    """
    Convert a Lark grammar string to GBNF format.

    GBNF reference:
    https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
    Lark grammar reference:
    https://lark-parser.readthedocs.io/en/latest/grammar.html

    The conversion is line based:

    - `#` and `//` comments are stripped from every line.
    - A line containing `:` (and not starting with `|`) defines a rule; a
      line starting with `|` adds an alternative to the preceding rule.
      Any other non-empty line is ignored.
    - Single-quoted literals are rewritten with double quotes.
    - The entry rule is `start` if such a rule is defined, otherwise the
      first rule. A `root ::= <entry>` alias is emitted for it, unless the
      entry rule is itself called `root` (the alias would then be a
      self-referential duplicate definition of `root`).

    Args:
        grammar_str: Input grammar in Lark format

    Returns:
        str: Converted grammar in GBNF format

    Raises:
        ValueError: If the input is not a non-empty string, contains no
            rule, has unbalanced quotes, has an alternative without a
            preceding rule, or references an undefined rule.

    Examples:
        >>> print(convert_lark_to_gbnf("rule: 'hello'"))
        root ::= rule
        rule ::= "hello"
    """
    if not isinstance(grammar_str, str):
        raise ValueError(f"Grammar must be a string, got {type(grammar_str)}")
    if not grammar_str.strip():
        raise ValueError("Grammar string cannot be empty")

    defined_rules: set[str] = set()
    referenced_rules: set[str] = set()
    output_lines: list[str] = []

    def clean_line(line: str) -> str:
        """Remove comments and whitespace from line."""
        return re.sub(r'(#|//).*$', '', line).strip()

    def check_quotes(text: str, rule_name: str, line_num: int) -> None:
        """Validate quote matching in text."""
        if text.count("'") % 2 != 0 or text.count('"') % 2 != 0:
            raise ValueError(
                f"Mismatched quotes in {rule_name} on line {line_num}")

    def extract_references(text: str) -> set[str]:
        """Extract rule references from text."""
        # Remove quoted strings and special characters
        text = re.sub(r'"[^"]*"', '', text)
        text = re.sub(r'[+*?()|\[\]{}]', ' ', text)
        return set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', text))

    # First pass: Find root rule and collect the names of defined rules
    lines = [clean_line(line) for line in grammar_str.split('\n')]
    first_rule = None

    for line in lines:
        if not line or line.startswith('|'):
            continue

        if ':' in line:
            # str.split always yields at least one element, so indexing
            # [0] cannot fail here.
            name = line.split(':', 1)[0].strip().strip('?')
            defined_rules.add(name)
            if first_rule is None:
                first_rule = name
            if name == 'start':
                first_rule = 'start'

    if not defined_rules:
        raise ValueError("No valid rules found in grammar")

    # Add root rule. When the entry rule is already named `root`, its own
    # definition serves as the GBNF root.
    if first_rule != 'root':
        output_lines.append(f"root ::= {first_rule}")

    # Second pass: Process rule definitions and alternatives
    current_rule = None
    current_definition: list[str] = []

    for line_num, line in enumerate(lines, 1):
        if not line:
            continue

        try:
            if ':' in line and not line.startswith('|'):
                # Save previous rule if exists
                if current_rule:
                    output_lines.append(
                        f"{current_rule} ::= {' | '.join(current_definition)}")

                # Process new rule
                name, definition = line.split(':', 1)
                current_rule = name.strip().strip('?')

                check_quotes(definition, f"rule '{current_rule}'", line_num)
                # GBNF literals use double quotes
                definition = re.sub(r"'([^']*)'", r'"\1"', definition)
                referenced_rules.update(extract_references(definition))
                current_definition = [definition.strip()]

            elif line.startswith('|'):
                if not current_rule:
                    raise ValueError(f"Alternative '|' on line {line_num} "
                                     "without a preceding rule definition")

                alt_def = line[1:].strip()
                check_quotes(alt_def, f"alternative for rule '{current_rule}'",
                             line_num)
                alt_def = re.sub(r"'([^']*)'", r'"\1"', alt_def)
                referenced_rules.update(extract_references(alt_def))
                current_definition.append(alt_def)

        except ValueError as e:
            raise ValueError(f"Error on line {line_num}: {str(e)}") from e

    # Add final rule if exists
    if current_rule:
        output_lines.append(
            f"{current_rule} ::= {' | '.join(current_definition)}")

    # Validate all rules are defined
    undefined_rules = referenced_rules - defined_rules - {'root'}
    if undefined_rules:
        raise ValueError("Referenced rules are not defined: "
                         f"{', '.join(sorted(undefined_rules))}")

    return '\n'.join(output_lines)
|
||||
426
model_executor/guided_decoding/xgrammar_decoding.py
Normal file
426
model_executor/guided_decoding/xgrammar_decoding.py
Normal file
@@ -0,0 +1,426 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# noqa: UP007
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
|
||||
import vllm.envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
try:
|
||||
import xgrammar as xgr
|
||||
xgr_installed = True
|
||||
except ImportError:
|
||||
xgr_installed = False
|
||||
pass
|
||||
|
||||
from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf,
|
||||
grammar_is_likely_lark)
|
||||
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def get_local_xgrammar_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer: PreTrainedTokenizer,
        model_config: ModelConfig,
        reasoner: ReasoningParser | None,
        max_threads: int = 8):
    """Build an xgrammar logits processor for the given guided-decoding params.

    Args:
        guided_params: Guided decoding request parameters
        tokenizer: The model's tokenizer
        model_config: Model configuration, forwarded to
            `GrammarConfig.from_guided_params`
        reasoner: Optional reasoning parser handed to the processor
        max_threads: Thread limit stored in the grammar configuration

    Returns:
        An `XGrammarLogitsProcessor` wrapping the derived `GrammarConfig`.
    """
    grammar_config = GrammarConfig.from_guided_params(
        guided_params=guided_params,
        model_config=model_config,
        tokenizer=tokenizer,
        max_threads=max_threads,
    )
    return XGrammarLogitsProcessor(grammar_config, reasoner)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TokenizerData:
    """Frozen record holding the cached, serializable tokenizer description.

    Instances are produced by `TokenizerDataCache.get_tokenizer_data` and
    later turned back into an `xgr.TokenizerInfo` via
    `from_vocab_and_metadata`.
    """
    # JSON-encoded tokenizer metadata string
    metadata: str
    # Token strings indexed by token id
    encoded_vocab: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class TokenizerDataCache:
    """Process-wide cache of `TokenizerData`, keyed by tokenizer hash."""
    _cache: dict[int, TokenizerData] = {}

    @classmethod
    def get_tokenizer_data(
        cls,
        tokenizer: PreTrainedTokenizer,
        /,
        *,
        tokenizer_hash: int,
        vocab_size: int,
    ) -> TokenizerData:
        """Return the `TokenizerData` for a tokenizer, computing it on a miss.

        Args:
            tokenizer: Tokenizer to describe; must provide `get_vocab()`
            tokenizer_hash: Cache key for this tokenizer
            vocab_size: Vocabulary size passed to xgrammar (see NOTE below)

        Returns:
            The cached (or newly built and cached) `TokenizerData`.

        Raises:
            ValueError: If the tokenizer has no `get_vocab` method.
        """
        cached = cls._cache.get(tokenizer_hash)
        if cached is not None:
            return cached

        tokenizer_info = xgr.TokenizerInfo.from_huggingface(
            tokenizer,
            # NOTE: We will need to use lm_head's vocab_size
            # to determine correct special_token_ids for this tokenizer.
            # See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92  # noqa: E501
            vocab_size=vocab_size,
        )
        metadata = json.loads(tokenizer_info.dump_metadata())

        # Vendored from xgrammar logic to get encoded_vocab
        # https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501
        try:
            vocab_dict = tokenizer.get_vocab()
        except AttributeError as e:
            raise ValueError(
                "Cannot get the vocabulary of the tokenizer "
                f"{type(tokenizer)}. The tokenizer should have a "
                "get_vocab method.") from e

        # maintain tokenizer's indexing; ids at or beyond the reported
        # vocab size are dropped and unused slots stay as empty strings
        slot_count = tokenizer_info.vocab_size
        encoded_vocab = [""] * slot_count
        for token, token_id in vocab_dict.items():
            if token_id < slot_count:
                encoded_vocab[token_id] = token

        if isinstance(tokenizer, MistralTokenizer):
            # REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
            metadata["vocab_type"] = xgr.VocabType.BYTE_FALLBACK
            metadata["add_prefix_space"] = True

        fresh = TokenizerData(
            encoded_vocab=encoded_vocab,
            metadata=json.dumps(metadata),
        )
        cls._cache[tokenizer_hash] = fresh
        return fresh
|
||||
|
||||
|
||||
class GrammarCompilerCache:
    """
    Cache for GrammarCompiler instances based on tokenizer.

    This cache reduces the overhead of creating new compiler instances when
    using the same tokenizer configuration. Entries are keyed by the string
    form of `config.tokenizer_hash`; a compiler is only built on a miss, so
    the `max_threads` of the first config seen for a tokenizer is the one
    used by that cached compiler.
    """
    _cache: dict[str, xgr.GrammarCompiler] = {}

    @classmethod
    def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler:
        """Return the compiler for `config`'s tokenizer, creating it on a miss."""
        cache_key = str(config.tokenizer_hash)

        compiler = cls._cache.get(cache_key)
        if compiler is not None:
            return compiler

        # In TokenizerDataCache.get_tokenizer_data, a serializable
        # tokenizer_data is created and cached. This data is used to build
        # a tokenizer_info and create an xgrammar compiler.
        tokenizer_data = config.tokenizer_data
        tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata(
            encoded_vocab=tokenizer_data.encoded_vocab,
            metadata=tokenizer_data.metadata,
        )
        # The environment value is expressed in MiB; xgrammar takes bytes.
        cache_limit_bytes = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024
        compiler = xgr.GrammarCompiler(
            tokenizer_info,
            max_threads=config.max_threads,
            cache_enabled=True,
            cache_limit_bytes=cache_limit_bytes,
        )
        cls._cache[cache_key] = compiler
        return compiler
|
||||
|
||||
|
||||
@dataclass
class GrammarConfig:
    """Serializable configuration for grammar compilation"""
    tokenizer_hash: int
    tokenizer_data: TokenizerData
    json_str: str | None = None
    grammar_str: str | None = None
    json_object: bool | None = None
    any_whitespace: bool = True
    regex_str: str | None = None
    max_threads: int = 8

    @classmethod
    def from_guided_params(cls,
                           guided_params: GuidedDecodingParams,
                           model_config: ModelConfig,
                           tokenizer: PreTrainedTokenizer,
                           max_threads: int = 8) -> GrammarConfig:
        """Derive a `GrammarConfig` from guided-decoding request parameters.

        The first truthy field of `guided_params` wins, in this order:
        `json`, `grammar`, `json_object`, `choice`, `regex`. JSON schemas,
        grammars and choice grammars are validated with xgrammar up front so
        that invalid input surfaces as `ValueError` here.

        Raises:
            ValueError: If validation fails, a Lark grammar cannot be
                converted to GBNF, or none of the supported fields is set.
        """
        tokenizer_hash = hash(tokenizer)
        tokenizer_data = TokenizerDataCache.get_tokenizer_data(
            tokenizer,
            tokenizer_hash=tokenizer_hash,
            vocab_size=model_config.hf_text_config.vocab_size,
        )
        # Keyword arguments common to every configuration built below.
        shared = {
            "tokenizer_hash": tokenizer_hash,
            "max_threads": max_threads,
            "tokenizer_data": tokenizer_data,
        }

        if guided_params.json:
            schema = guided_params.json
            json_str = schema if isinstance(schema, str) else json.dumps(schema)

            any_whitespace = not guided_params.disable_any_whitespace

            # Check and log if model with xgrammar and whitespace have history
            # of runaway generation of whitespaces.
            # References:
            # https://github.com/vllm-project/vllm/pull/12744
            # https://github.com/mlc-ai/xgrammar/issues/212
            model_with_warn = next(
                (family for family in ('Mistral', 'Qwen')
                 if family in model_config.model),
                None,
            )

            if model_with_warn is not None and any_whitespace:
                logger.info_once(
                    "%s model detected, consider setting `disable_any_whitespace` to prevent runaway generation of whitespaces.",  # noqa: E501
                    model_with_warn,
                )
            # Validate the schema and raise ValueError here if it is invalid.
            # This is to avoid exceptions in model execution, which will crash
            # the engine worker process.
            try:
                xgr.Grammar.from_json_schema(json_str,
                                             any_whitespace=any_whitespace)
            except RuntimeError as err:
                raise ValueError(str(err)) from err

            return cls(json_str=json_str,
                       any_whitespace=any_whitespace,
                       **shared)

        if guided_params.grammar:
            # XGrammar only supports GBNF grammars, so we must convert Lark
            if grammar_is_likely_lark(guided_params.grammar):
                try:
                    grammar_str = convert_lark_to_gbnf(guided_params.grammar)
                except ValueError as e:
                    raise ValueError(
                        "Failed to convert the grammar from Lark to GBNF. "
                        "Please either use GBNF grammar directly or specify"
                        " --guided-decoding-backend=outlines.\n"
                        f"Conversion error: {str(e)}") from e
            else:
                grammar_str = guided_params.grammar

            # Validate the grammar and raise ValueError here if it is invalid.
            # This is to avoid exceptions in model execution, which will crash
            # the engine worker process.
            try:
                xgr.Grammar.from_ebnf(grammar_str)
            except RuntimeError as err:
                raise ValueError(str(err)) from err

            return cls(grammar_str=grammar_str, **shared)

        if guided_params.json_object:
            return cls(json_object=True, **shared)

        if guided_params.choice:
            choice_str = GrammarConfig.choice_as_grammar(guided_params.choice)
            try:
                xgr.Grammar.from_ebnf(choice_str)
            except RuntimeError as err:
                raise ValueError(str(err)) from err

            return cls(grammar_str=choice_str, **shared)

        if guided_params.regex:
            return cls(regex_str=guided_params.regex, **shared)

        raise ValueError(
            "Currently only support JSON and EBNF grammar mode for xgrammar")

    @staticmethod
    def escape_ebnf_string(s: str) -> str:
        """Escape special characters in a EBNF string."""
        # Prefix every double quote and backslash with a backslash
        return re.sub(r'(["\\])', r'\\\1', s)

    @staticmethod
    def choice_as_grammar(choice: list[str] | None) -> str:
        """Render a list of choices as a one-rule EBNF grammar.

        Each choice becomes an escaped, double-quoted literal and the
        literals are joined as alternatives of the `root` rule.

        Raises:
            ValueError: If `choice` is None.
        """
        if choice is None:
            raise ValueError("Choice is not set")
        literals = [
            f'"{GrammarConfig.escape_ebnf_string(option)}"'
            for option in choice
        ]
        return 'root ::= ' + ' | '.join(literals)

    @staticmethod
    def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo:
        """Rebuild an `xgr.TokenizerInfo` from cached tokenizer data."""
        return xgr.TokenizerInfo.from_vocab_and_metadata(
            encoded_vocab=tokenizer_data.encoded_vocab,
            metadata=tokenizer_data.metadata,
        )
|
||||
|
||||
|
||||
@dataclass
class XGrammarLogitsProcessor:
    """Wrapper class to support pickle protocol

    Only `config` and `reasoner` are pickled; the compiled grammar, the
    matchers and the token bitmask are rebuilt lazily on first use after
    unpickling.
    """
    config: GrammarConfig
    reasoner: ReasoningParser | None = None

    ctx: xgr.CompiledGrammar | None = None
    tokenizer_info: xgr.TokenizerInfo = None  # type: ignore[assignment]
    token_bitmask: torch.Tensor = None  # type: ignore[assignment]
    matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
    batch_size: int = field(default=1)
    prefilled: bool = field(default=False)

    def __post_init__(self):
        # Derive the tokenizer info from the config unless one was supplied.
        if self.tokenizer_info is None:
            self.tokenizer_info = self.config.tokenizer_info(
                self.config.tokenizer_data)

    def __getstate__(self) -> dict[str, Any]:
        """Pickle only the serializable parts (config and reasoner)."""
        return {'config': self.config, 'reasoner': self.reasoner}

    def __setstate__(self, state: dict[str, Any]):
        """Restore config/reasoner and reset all lazily built state."""
        self.config = state['config']
        self.reasoner = state['reasoner']

        self.tokenizer_info = GrammarConfig.tokenizer_info(
            self.config.tokenizer_data)
        self.ctx = None
        self.matchers = []
        self.batch_size = 1
        self.token_bitmask = None  # type: ignore[assignment]
        self.prefilled = False

    def _ensure_ctx(self):
        """Lazily initialize the processor in the worker process"""
        if self.ctx is not None:
            return

        compiler = GrammarCompilerCache.get_compiler(self.config)
        if self.config.json_str is not None:
            self.ctx = compiler.compile_json_schema(
                self.config.json_str,
                any_whitespace=self.config.any_whitespace)
        elif self.config.grammar_str is not None:
            self.ctx = compiler.compile_grammar(self.config.grammar_str)
        elif self.config.json_object:
            self.ctx = compiler.compile_json_schema(
                '{"type": "object"}',
                any_whitespace=self.config.any_whitespace)
        elif self.config.regex_str:
            self.ctx = compiler.compile_regex(self.config.regex_str)
        else:
            raise ValueError(
                "Invalid configuration for xgrammar logits processor")

    def __call__(self, input_ids: list[int],
                 scores: torch.Tensor) -> torch.Tensor:
        """Mask `scores` so that only grammar-conforming tokens remain.

        Args:
            input_ids: Token ids generated so far; the last one is fed to
                the matchers on every call after the first.
            scores: Logits for the next token.

        Returns:
            The masked scores. On CUDA the input tensor is modified in
            place; on other devices a converted copy is masked and moved
            back to the original device and dtype.

        Raises:
            AssertionError: If a matcher rejects the last sampled token.
        """
        # Skip the structured logits processing if reasoning is not finished.
        # reasoner is not None only when `--reasoning-parser` is set.
        if self.reasoner is not None and \
                not self.reasoner.is_reasoning_end(input_ids):
            return scores

        self._ensure_ctx()

        if len(self.matchers) == 0:
            self.matchers = [
                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
            ]
            self.token_bitmask = xgr.allocate_token_bitmask(
                self.batch_size, self.tokenizer_info.vocab_size)

        if not self.prefilled:
            # Have not sampled a token yet
            self.prefilled = True
        else:
            for matcher in self.matchers:
                if not matcher.is_terminated():
                    sampled_token = input_ids[-1]
                    # accept_token advances the matcher state, so it must
                    # not live inside an `assert` statement: asserts are
                    # removed under `python -O`, which would silently stop
                    # the matcher from ever advancing.
                    accepted = matcher.accept_token(sampled_token)
                    if not accepted:
                        raise AssertionError(
                            f"xgrammar matcher rejected sampled token "
                            f"{sampled_token}")

        for i, matcher in enumerate(self.matchers):
            if not matcher.is_terminated():
                # @ubospica: ideally, fill_next_token_bitmask should be
                # parallelized with model decoding
                # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303
                matcher.fill_next_token_bitmask(self.token_bitmask, i)

        # token_bitmask is a CPU tensor for use with accept_token and
        # fill_next_token_bitmask so we move it to the device of scores
        original_device = scores.device
        original_dtype = scores.dtype
        on_cuda = original_device.type == "cuda"
        if not on_cuda:
            # xgrammar on cpu only supports float32 scores
            # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
            scores = scores.to("cpu").float().unsqueeze(0)

        # Note: In this method, if the tensors have different dimensions
        # on CPU device fails, but on GPU it runs without error. Hence the
        # unsqueeze above for scores, to match the token bitmask shape
        xgr.apply_token_bitmask_inplace(
            scores, self.token_bitmask.to(scores.device, non_blocking=True))
        if not on_cuda:
            # Return to the exact original device (including its index) and
            # undo only the dimension added by unsqueeze(0) above.
            scores = scores.to(original_dtype).to(original_device).squeeze(0)

        return scores

    def clone(self) -> XGrammarLogitsProcessor:
        """Create a new instance with shared compiled grammar
        but separate state"""
        new_processor = XGrammarLogitsProcessor(self.config, self.reasoner,
                                                None, self.tokenizer_info)

        # Share the compiled grammar context (immutable after compilation)
        new_processor.ctx = self.ctx

        # Create fresh matchers for the new sequence
        if self.ctx is not None:
            new_processor.matchers = [
                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
            ]

        # Create a new token bitmask with the same size. The bitmask is
        # written on every call, so each clone needs its own buffer instead
        # of aliasing the parent's tensor.
        if self.token_bitmask is not None:
            new_processor.token_bitmask = xgr.allocate_token_bitmask(
                self.batch_size, self.tokenizer_info.vocab_size)

        # Copy simple attributes
        new_processor.batch_size = self.batch_size
        # Reset prefilled state for new sequence
        new_processor.prefilled = False

        return new_processor
|
||||
Reference in New Issue
Block a user