add qwen3

2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/init.py
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/init.py
@@ -0,0 +1,46 @@
+from typing import Optional
+
+from vllm.logits_process import LogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
+
+
+async def get_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer) -> Optional[LogitsProcessor]:
+    """Build the logits processor for a guided decoding request (async).
+
+    Dispatches on ``guided_params.backend``. A request that carries a
+    ``grammar`` is always routed to outlines, whatever backend was selected,
+    because LMFE does not support CFG grammars.
+
+    Args:
+        guided_params: Guided decoding options. Only ``backend`` and
+            ``grammar`` are read here; the rest is interpreted by the
+            backend-specific factory.
+        tokenizer: Tokenizer forwarded unchanged to the backend factory.
+
+    Returns:
+        Whatever the selected backend factory returns.
+
+    Raises:
+        ValueError: If ``guided_params.backend`` names no known backend.
+    """
+    # CFG grammar not supported by LMFE, so we use outlines instead
+    if guided_params.backend == 'outlines' or guided_params.grammar:
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
+            get_outlines_guided_decoding_logits_processor)
+        return await get_outlines_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+    if guided_params.backend == 'lm-format-enforcer':
+        # The LMFE factory is synchronous, so it is called without await.
+        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
+            get_local_lm_format_enforcer_guided_decoding_logits_processor)
+        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+
+    raise ValueError(
+        f"Unknown guided decoding backend '{guided_params.backend}'. "
+        "Must be one of 'outlines', 'lm-format-enforcer'")
+
+
+def get_local_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer) -> Optional[LogitsProcessor]:
+    """Build the logits processor for a guided decoding request (sync).
+
+    Dispatches on ``guided_params.backend``. A request that carries a
+    ``grammar`` is always routed to outlines, whatever backend was selected,
+    because LMFE does not support CFG grammars.
+
+    Args:
+        guided_params: Guided decoding options. Only ``backend`` and
+            ``grammar`` are read here; the rest is interpreted by the
+            backend-specific factory.
+        tokenizer: Tokenizer forwarded unchanged to the backend factory.
+
+    Returns:
+        Whatever the selected backend factory returns.
+
+    Raises:
+        ValueError: If ``guided_params.backend`` names no known backend.
+    """
+    # CFG grammar not supported by LMFE, so we use outlines instead
+    if guided_params.backend == 'outlines' or guided_params.grammar:
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
+            get_local_outlines_guided_decoding_logits_processor)
+        return get_local_outlines_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+    if guided_params.backend == 'lm-format-enforcer':
+        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
+            get_local_lm_format_enforcer_guided_decoding_logits_processor)
+        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+
+    raise ValueError(
+        f"Unknown guided decoding backend '{guided_params.backend}'. "
+        "Must be one of 'outlines', 'lm-format-enforcer'")
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/pycache/init.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/pycache/init.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/pycache/guided_fields.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/pycache/guided_fields.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/guided_fields.py
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/guided_fields.py
@@ -0,0 +1,39 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, TypedDict, Union
+
+from pydantic import BaseModel
+
+
+# These classes are deprecated, see SamplingParams
+class LLMGuidedOptions(TypedDict, total=False):
+    """Dict form of the guided decoding options.
+
+    Declared with ``total=False``, so every key is optional. The keys
+    mirror the fields of ``GuidedDecodingRequest``.
+    """
+    guided_json: Union[Dict, BaseModel, str]
+    guided_regex: str
+    guided_choice: List[str]
+    guided_grammar: str
+    guided_decoding_backend: str
+    guided_whitespace_pattern: str
+    guided_json_object: bool
+
+
+@dataclass
+class GuidedDecodingRequest:
+    """Guided decoding options; at most one guide field may be set.
+
+    The guide fields are ``guided_json``, ``guided_regex``,
+    ``guided_choice``, ``guided_grammar`` and ``guided_json_object``.
+    The backend and whitespace-pattern fields are not counted as guides.
+    """
+    guided_json: Optional[Union[Dict, BaseModel, str]] = None
+    guided_regex: Optional[str] = None
+    guided_choice: Optional[List[str]] = None
+    guided_grammar: Optional[str] = None
+    guided_decoding_backend: Optional[str] = None
+    guided_whitespace_pattern: Optional[str] = None
+    guided_json_object: Optional[bool] = None
+
+    def __post_init__(self):
+        """Raise ``ValueError`` when more than one guide field is set."""
+        guide_fields = (
+            self.guided_json,
+            self.guided_regex,
+            self.guided_choice,
+            self.guided_grammar,
+            self.guided_json_object,
+        )
+        # A field counts as set whenever it is not None, even if falsy.
+        provided = [value for value in guide_fields if value is not None]
+        if len(provided) > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding but multiple are "
+                f"specified: {self.__dict__}")
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -0,0 +1,64 @@
+from functools import lru_cache
+from json import loads as json_loads
+from typing import Optional, Union
+
+from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser,
+                              RegexParser, StringParser,
+                              TokenEnforcerTokenizerData, UnionParser)
+from lmformatenforcer.integrations.vllm import (
+    build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
+from transformers import PreTrainedTokenizerBase
+
+from vllm.logits_process import LogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
+
+
+def get_local_lm_format_enforcer_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer) -> Optional[LogitsProcessor]:
+    """
+    Build an lm-format-enforcer logits processor for the requested guide.
+
+    The guide is picked in this order: json, choice, regex, grammar,
+    json_object. The grammar option is rejected with a ValueError, and
+    None is returned when no guide is set. Only the tokenizer data is
+    cached (per tokenizer); a new parser and logits processor are built
+    on every call.
+    """
+
+    tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
+        tokenizer)
+
+    parser: CharacterLevelParser
+    if guided_params.json:
+        parser = JsonSchemaParser(
+            _normalize_json_schema_object(guided_params.json))
+    elif guided_params.choice:
+        # One literal-string parser per choice, combined as alternatives.
+        alternatives = [
+            StringParser(option) for option in guided_params.choice
+        ]
+        parser = UnionParser(alternatives)
+    elif guided_params.regex:
+        parser = RegexParser(guided_params.regex)
+    elif guided_params.grammar:
+        # CFG grammar not supported by LMFE
+        raise ValueError("Cannot construct a guided decoding logits processor"
+                         " using the grammar option with the"
+                         " lm_format_enforcer backend.")
+    elif guided_params.json_object:
+        # None means any json object
+        parser = JsonSchemaParser(None)
+    else:
+        return None
+
+    return build_vllm_logits_processor(tokenizer_data, parser)
+
+
+def _normalize_json_schema_object(schema: Union[str, dict]) -> dict:
+    """Return *schema* as a dict, parsing it first when it is JSON text.
+
+    A dict is returned as-is (same object). Any other type raises
+    AssertionError.
+    """
+    if isinstance(schema, dict):
+        return schema
+    if isinstance(schema, str):
+        return json_loads(schema)
+    raise AssertionError(f"Unsupported schema type {schema}")
+
+
+@lru_cache
+def _cached_build_vllm_token_enforcer_tokenizer_data(
+        tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData:
+    """Build the LMFE tokenizer data for *tokenizer*, memoized per tokenizer.
+
+    ``lru_cache`` keys on the tokenizer argument, so the tokenizer must be
+    hashable; repeated calls with the same tokenizer reuse the cached data.
+    """
+    return build_vllm_token_enforcer_tokenizer_data(tokenizer)
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -0,0 +1,133 @@
+import asyncio
+import concurrent.futures
+from enum import Enum
+from json import dumps as json_dumps
+from re import escape as regex_escape
+from typing import Tuple, Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+    CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
+from vllm.sampling_params import GuidedDecodingParams
+
+
+class GuidedDecodingMode(Enum):
+    """Kind of guide; selects which logits processor class is built."""
+    JSON = "json"
+    REGEX = "regex"
+    # Choices are turned into a regex alternation and handled like REGEX.
+    CHOICE = "choice"
+    GRAMMAR = "grammar"
+
+
+# Adapted from
+# https://github.com/outlines-dev/outlines/blob/main/outlines/grammars/json.lark
+# The main difference is that the start rule was changed to
+# `start: object | array`, so scalar values are rejected as the root of the
+# JSON document. Allowing a scalar root seems to cause llama to generate
+# without stopping.
+JSON_GRAMMAR = r"""
+?start: object | array
+
+?value: object
+| array
+| UNESCAPED_STRING
+| SIGNED_NUMBER      -> number
+| "true"             -> true
+| "false"            -> false
+| "null"             -> null
+
+array  : "[" [value ("," value)*] "]"
+object : "{" [pair ("," pair)*] "}"
+pair   : UNESCAPED_STRING ":" value
+
+%import common.UNESCAPED_STRING
+%import common.SIGNED_NUMBER
+%import common.WS
+
+%ignore WS
+"""
+
+# Thread pool used for generating the logits processor FSM; created lazily
+# on first use by get_outlines_guided_decoding_logits_processor.
+global_thread_pool = None
+
+
+async def get_outlines_guided_decoding_logits_processor(
+    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
+           None]:
+    """
+    Build the outlines logits processor for the requested guide (async).
+
+    Returns None when the request carries no guide. Otherwise the
+    processor is constructed on a module-level thread pool, created on
+    first use with two workers, and the result is awaited so the
+    construction does not run on the event loop thread.
+    """
+    global global_thread_pool
+
+    guide, mode = _get_guide_and_mode(guided_params)
+    if not (guide and mode):
+        return None
+
+    if global_thread_pool is None:
+        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
+            max_workers=2)
+
+    event_loop = asyncio.get_running_loop()
+    pending = event_loop.run_in_executor(global_thread_pool,
+                                         _get_logits_processor, guide,
+                                         tokenizer, mode,
+                                         guided_params.whitespace_pattern)
+    return await pending
+
+
+def get_local_outlines_guided_decoding_logits_processor(
+    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
+           None]:
+    """
+    Build the outlines logits processor for the requested guide (sync).
+
+    Returns None when the request carries no guide; otherwise the
+    processor is constructed directly in the calling thread.
+    """
+    guide, mode = _get_guide_and_mode(guided_params)
+    if guide and mode:
+        return _get_logits_processor(guide, tokenizer, mode,
+                                     guided_params.whitespace_pattern)
+    return None
+
+
+def _get_guide_and_mode(
+    guided_params: GuidedDecodingParams
+) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:
+    """Pick the guide and its mode from *guided_params*.
+
+    Fields are checked in priority order: json, regex, choice, grammar,
+    json_object. Returns ``(None, None)`` when none of them is set.
+    """
+    if guided_params.json:
+        schema = guided_params.json
+        if isinstance(schema, dict):
+            # turn dict into hashable string
+            schema = json_dumps(schema)
+        return schema, GuidedDecodingMode.JSON
+
+    if guided_params.regex:
+        return guided_params.regex, GuidedDecodingMode.REGEX
+
+    if guided_params.choice:
+        # choice just uses regex: an alternation of the escaped options
+        escaped = (regex_escape(str(option))
+                   for option in guided_params.choice)
+        return "(" + "|".join(escaped) + ")", GuidedDecodingMode.CHOICE
+
+    if guided_params.grammar:
+        return guided_params.grammar, GuidedDecodingMode.GRAMMAR
+
+    if guided_params.json_object:
+        return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
+
+    return None, None
+
+
+def _get_logits_processor(
+    guide: str, tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode,
+    whitespace_pattern: Union[str, None]
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
+    """Instantiate the logits processor class that matches *mode*.
+
+    ``whitespace_pattern`` is only passed on for JSON guides. An
+    unrecognised mode raises ValueError.
+    """
+    if mode == GuidedDecodingMode.JSON:
+        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
+    if mode in (GuidedDecodingMode.REGEX, GuidedDecodingMode.CHOICE):
+        return RegexLogitsProcessor(guide, tokenizer)
+    if mode == GuidedDecodingMode.GRAMMAR:
+        return CFGLogitsProcessor(guide, tokenizer)
+    raise ValueError(f"Unknown guided decoding mode {mode}")
--- a/vllm-v0.6.2/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm-v0.6.2/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -0,0 +1,222 @@
+# Copyright 2024- the Outlines developers
+# This file is adapted from
+# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import json
+from collections import defaultdict
+from functools import lru_cache
+from typing import Callable, DefaultDict, Dict, List, Union
+
+import numpy as np
+import torch
+from lark import Lark
+from outlines import grammars
+from outlines.caching import cache
+from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
+from outlines.fsm.json_schema import build_regex_from_schema
+from pydantic import BaseModel
+from transformers import PreTrainedTokenizerBase
+
+
+class BaseLogitsProcessor:
+    """Callable that masks logits so only guide-approved tokens remain.
+
+    The guide's state is tracked per token prefix: the hash of the tuple of
+    generated token ids is the key into ``_fsm_state``.
+    """
+
+    def __init__(self, guide: Guide):
+        self._guide: Guide = guide
+        # Maps hash(prefix tokens) -> guide state; unseen prefixes start at
+        # state 0 (the defaultdict's int default).
+        # NOTE(review): nothing in this class removes entries, so the map
+        # grows with every distinct prefix seen - confirm this is acceptable.
+        self._fsm_state: DefaultDict[int, int] = defaultdict(int)
+
+    def __call__(self, input_ids: List[int],
+                 scores: torch.Tensor) -> torch.Tensor:
+        """Use the FSM to bias the logits before sampling the next token."""
+        seq_id = hash(tuple(input_ids))
+
+        if len(input_ids) > 0:
+            # Advance from the state stored for the prefix without the last
+            # token, and store the result under the full prefix.
+            last_token = input_ids[-1]
+            last_seq_id = hash(tuple(input_ids[:-1]))
+            self._fsm_state[seq_id] = self._guide.get_next_state(
+                state=self._fsm_state[last_seq_id], token_id=last_token)
+        else:
+            # Note: this is a hack.
+            # Lark pickling does not work properly (silent failure),
+            # which breaks the RPC (which uses python pickling).
+            # We need to find a better solution.
+            # On the first time this is called, we simply re-create
+            # the Lark object.
+            if isinstance(self._guide, CFGGuide):
+                self._guide.parser = Lark(
+                    self._guide.cfg_string,
+                    parser="lalr",
+                    lexer="contextual",
+                    propagate_positions=False,
+                    maybe_placeholders=False,
+                    regex=True,
+                    import_paths=[grammars.GRAMMAR_PATH],
+                )
+
+        instruction = self._guide.get_next_instruction(
+            state=self._fsm_state[seq_id])
+
+        # Exact type comparison (not isinstance) is kept as in the original.
+        if type(instruction) == Generate:  # noqa: E721
+            allowed_tokens = instruction.tokens
+        elif type(instruction) == Write:  # noqa: E721
+            # TODO: support fast forward tokens
+            # Only the first token of the write instruction is allowed.
+            allowed_tokens = [instruction.tokens[0]]
+        else:
+            raise TypeError(
+                f"Unsupported instruction type {type(instruction)}")
+
+        # Start from -inf for every vocabulary entry; allowed ids are reset
+        # to 0 below, so adding the mask leaves only their scores finite.
+        mask = torch.full((scores.shape[-1], ),
+                          -torch.inf,
+                          device=scores.device)
+        # The tokenizer may support more token ids than the model can generate,
+        # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
+        # but scores.shape == torch.Size([128256])
+        # Using NumPy is faster for filtering token ids
+        allowed_tokens = np.array(allowed_tokens, dtype=np.int64)
+        allowed_tokens = torch.tensor(allowed_tokens, device=scores.device)
+        allowed_tokens = allowed_tokens.masked_select(
+            allowed_tokens < scores.shape[-1])
+        mask.index_fill_(0, allowed_tokens, 0)
+        # In-place add: the caller's tensor is modified and also returned.
+        scores.add_(mask)
+        return scores
+
+
+class RegexLogitsProcessor(BaseLogitsProcessor):
+    """Logits processor whose guide is compiled from a regular expression."""
+
+    @classmethod
+    @cache()
+    def _get_guide(cls, regex_string: str,
+                   tokenizer: PreTrainedTokenizerBase) -> Guide:
+        """Build the regex guide over the outlines-adapted tokenizer."""
+        adapted_tokenizer = _adapt_tokenizer(tokenizer)
+        return RegexGuide(regex_string, adapted_tokenizer)
+
+    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
+        """Compile the FSM that drives the regex-structured generation.
+
+        Parameters
+        ----------
+        regex_string
+            The regular expression the generated text has to match
+        tokenizer
+            The model's tokenizer
+
+        """
+        regex_guide = RegexLogitsProcessor._get_guide(regex_string, tokenizer)
+        super().__init__(regex_guide)
+
+
+class JSONLogitsProcessor(RegexLogitsProcessor):
+    """Regex logits processor whose regex is derived from a JSON schema."""
+
+    def __init__(self, schema: Union[str, Dict, BaseModel],
+                 tokenizer: PreTrainedTokenizerBase,
+                 whitespace_pattern: Union[str, None]):
+        """Compile the FSM that drives the JSON-guided generation.
+
+        Parameters
+        ----------
+        schema
+            The JSON schema describing the structure to generate, given as
+            a Pydantic model class, a dictionary, or a JSON string
+        tokenizer
+            The model's tokenizer
+        whitespace_pattern
+            Pattern to use for JSON syntactic whitespace (doesn't impact
+            string literals), e.g. a pattern that permits at most a single
+            space or newline
+        """
+        if isinstance(schema, type(BaseModel)):
+            # A Pydantic model class: export its schema and serialise it.
+            serialized_schema = json.dumps(schema.model_json_schema())
+        elif isinstance(schema, Dict):
+            serialized_schema = json.dumps(schema)
+        elif isinstance(schema, str):
+            serialized_schema = schema
+        else:
+            raise ValueError(
+                f"Cannot parse schema {schema}. The schema must be either "
+                f"a Pydantic object, a dictionary or a string that contains "
+                f"the JSON Schema specification")
+
+        schema_regex = build_regex_from_schema(serialized_schema,
+                                               whitespace_pattern)
+        super().__init__(schema_regex, tokenizer)
+
+
+class CFGLogitsProcessor(BaseLogitsProcessor):
+    """Logits processor whose guide is built from a context-free grammar."""
+
+    @classmethod
+    @cache()
+    def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
+        """Build the CFG guide over the outlines-adapted tokenizer."""
+        adapted_tokenizer = _adapt_tokenizer(tokenizer)
+        return CFGGuide(cfg, adapted_tokenizer)
+
+    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
+        """Compile the FSM that drives the context free grammar generation.
+
+        Parameters
+        ----------
+        cfg
+            The context-free grammar, as a string
+        tokenizer
+            The model's tokenizer
+
+        """
+        cached_guide = CFGLogitsProcessor._get_guide(cfg, tokenizer)
+        super().__init__(cached_guide)
+        # Each processor works on its own copy of the cached guide.
+        self._guide = self._guide.copy()
+
+
+@lru_cache(maxsize=32)
+def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
+    """Return a copy of vLLM's tokenizer shaped for outlines' FSM compiler.
+
+    Outlines expects a slightly different tokenizer API than
+    `transformers` provides: its decoder returns a list, whereas the
+    tokenizer's `decode` returns a str. The copy therefore gets a
+    list-returning `decode`, plus `vocabulary`, `special_tokens` and a
+    `convert_token_to_string` that restores the leading space that HF's
+    Llama tokenizers drop. A tokenizer that is already adapted is
+    returned unchanged; the input tokenizer itself is never modified.
+
+    """
+    if getattr(tokenizer, "_outlines_adapted", False):
+        return tokenizer
+
+    adapted = copy.deepcopy(tokenizer)
+    adapted.vocabulary = adapted.get_vocab()
+    adapted.special_tokens = set(adapted.all_special_tokens)
+
+    # Keep the str-returning decoder so the list wrapper can delegate to it.
+    str_decoder = adapted.decode
+
+    def convert_token_to_string(token: str) -> str:
+        from transformers.file_utils import SPIECE_UNDERLINE
+
+        text = adapted.convert_tokens_to_string([token])
+
+        # A hack to handle missing spaces to HF's Llama tokenizers
+        needs_space = (token.startswith(SPIECE_UNDERLINE)
+                       or token == "<0x20>")
+        return " " + text if needs_space else text
+
+    def list_decoder(inp_tokens: List[int]) -> List[str]:
+        """Sync vLLM's decoder with the outlines by returning list."""
+        return [str_decoder(inp_tokens)]
+
+    adapted.convert_token_to_string = convert_token_to_string
+    adapted.decode = list_decoder
+    setattr(adapted, "_outlines_adapted", True)  # noqa: B010
+
+    return adapted