forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
46
vllm-v0.6.2/vllm/model_executor/guided_decoding/__init__.py
Normal file
46
vllm-v0.6.2/vllm/model_executor/guided_decoding/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from typing import Optional
|
||||
|
||||
from vllm.logits_process import LogitsProcessor
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
async def get_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer) -> Optional[LogitsProcessor]:
    """Asynchronously build the logits processor for a guided request.

    The backend is selected from ``guided_params.backend``. A request that
    carries a grammar is always routed to outlines, whatever backend was
    asked for.

    Args:
        guided_params: Guided decoding parameters; ``backend`` and
            ``grammar`` are read here, the rest is forwarded to the backend.
        tokenizer: Tokenizer forwarded unchanged to the backend.

    Returns:
        Whatever the selected backend function returns.

    Raises:
        ValueError: If the backend name is not recognised.
    """
    # CFG grammar not supported by LMFE, so we use outlines instead
    if guided_params.backend == 'outlines' or guided_params.grammar:
        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
            get_outlines_guided_decoding_logits_processor)
        return await get_outlines_guided_decoding_logits_processor(
            guided_params, tokenizer)
    if guided_params.backend == 'lm-format-enforcer':
        # The LMFE builder is synchronous, so it is called directly.
        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
            get_local_lm_format_enforcer_guided_decoding_logits_processor)
        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
            guided_params, tokenizer)

    raise ValueError(
        f"Unknown guided decoding backend '{guided_params.backend}'. "
        "Must be one of 'outlines', 'lm-format-enforcer'")
|
||||
|
||||
|
||||
def get_local_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer) -> Optional[LogitsProcessor]:
    """Synchronously build the logits processor for a guided request.

    The backend is selected from ``guided_params.backend``. A request that
    carries a grammar is always routed to outlines, whatever backend was
    asked for.

    Args:
        guided_params: Guided decoding parameters; ``backend`` and
            ``grammar`` are read here, the rest is forwarded to the backend.
        tokenizer: Tokenizer forwarded unchanged to the backend.

    Returns:
        Whatever the selected backend function returns.

    Raises:
        ValueError: If the backend name is not recognised.
    """
    # CFG grammar not supported by LMFE, so we use outlines instead
    if guided_params.backend == 'outlines' or guided_params.grammar:
        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
            get_local_outlines_guided_decoding_logits_processor)
        return get_local_outlines_guided_decoding_logits_processor(
            guided_params, tokenizer)
    if guided_params.backend == 'lm-format-enforcer':
        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
            get_local_lm_format_enforcer_guided_decoding_logits_processor)
        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
            guided_params, tokenizer)

    raise ValueError(
        f"Unknown guided decoding backend '{guided_params.backend}'. "
        "Must be one of 'outlines', 'lm-format-enforcer'")
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,39 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, TypedDict, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# These classes are deprecated, see SamplingParams
|
||||
class LLMGuidedOptions(TypedDict, total=False):
    """Dictionary form of the guided decoding options.

    Declared with ``total=False``, so every key is optional. The keys carry
    the same names as the fields of ``GuidedDecodingRequest``.
    """
    # Guide selectors.
    guided_json: Union[Dict, BaseModel, str]
    guided_regex: str
    guided_choice: List[str]
    guided_grammar: str
    # Additional settings.
    guided_decoding_backend: str
    guided_whitespace_pattern: str
    # Boolean guide selector.
    guided_json_object: bool
|
||||
|
||||
|
||||
@dataclass
class GuidedDecodingRequest:
    """Guided decoding options, of which at most one guide kind may be set.

    The guide kinds are ``guided_json``, ``guided_regex``, ``guided_choice``,
    ``guided_grammar`` and ``guided_json_object``. The backend and the
    whitespace pattern are not counted as guides.
    """
    guided_json: Optional[Union[Dict, BaseModel, str]] = None
    guided_regex: Optional[str] = None
    guided_choice: Optional[List[str]] = None
    guided_grammar: Optional[str] = None
    guided_decoding_backend: Optional[str] = None
    guided_whitespace_pattern: Optional[str] = None
    guided_json_object: Optional[bool] = None

    def __post_init__(self):
        """Reject instances that specify more than one kind of guide."""
        exclusive_guides = (
            self.guided_json,
            self.guided_regex,
            self.guided_choice,
            self.guided_grammar,
            self.guided_json_object,
        )
        # A guide counts as specified whenever it is not None, so an
        # explicit ``False`` for guided_json_object counts as well.
        specified = [guide for guide in exclusive_guides if guide is not None]
        if len(specified) > 1:
            raise ValueError(
                "You can only use one kind of guided decoding but multiple are "
                f"specified: {self.__dict__}")
|
||||
@@ -0,0 +1,64 @@
|
||||
from functools import lru_cache
|
||||
from json import loads as json_loads
|
||||
from typing import Optional, Union
|
||||
|
||||
from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser,
|
||||
RegexParser, StringParser,
|
||||
TokenEnforcerTokenizerData, UnionParser)
|
||||
from lmformatenforcer.integrations.vllm import (
|
||||
build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.logits_process import LogitsProcessor
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
def get_local_lm_format_enforcer_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer) -> Optional[LogitsProcessor]:
    """Build an lm-format-enforcer logits processor for the given guide.

    The guide is taken from the first truthy option among ``json``,
    ``choice``, ``regex``, ``grammar`` and ``json_object``, checked in that
    order. The tokenizer data is obtained through a cached helper.

    Returns:
        The logits processor, or ``None`` when no guide option is set.

    Raises:
        ValueError: If ``grammar`` is the selected option; it cannot be
            used with this backend.
    """
    enforcer_tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
        tokenizer)

    parser: CharacterLevelParser
    if guided_params.json:
        parser = JsonSchemaParser(
            _normalize_json_schema_object(guided_params.json))
    elif guided_params.choice:
        alternatives = []
        for choice in guided_params.choice:
            alternatives.append(StringParser(choice))
        parser = UnionParser(alternatives)
    elif guided_params.regex:
        parser = RegexParser(guided_params.regex)
    elif guided_params.grammar:
        # CFG grammar not supported by LMFE
        raise ValueError("Cannot construct a guided decoding logits processor"
                         " using the grammar option with the"
                         " lm_format_enforcer backend.")
    elif guided_params.json_object:
        # None means any json object
        parser = JsonSchemaParser(None)
    else:
        return None

    return build_vllm_logits_processor(enforcer_tokenizer_data, parser)
|
||||
|
||||
|
||||
def _normalize_json_schema_object(schema: Union[str, dict]) -> dict:
|
||||
if isinstance(schema, str):
|
||||
return json_loads(schema)
|
||||
if isinstance(schema, dict):
|
||||
return schema
|
||||
raise AssertionError(f"Unsupported schema type {schema}")
|
||||
|
||||
|
||||
@lru_cache
def _cached_build_vllm_token_enforcer_tokenizer_data(
        tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData:
    """Build the token-enforcer tokenizer data, memoised per tokenizer.

    ``lru_cache`` keys on the tokenizer argument, so repeated calls with the
    same tokenizer reuse the previously built data.
    """
    tokenizer_data = build_vllm_token_enforcer_tokenizer_data(tokenizer)
    return tokenizer_data
|
||||
@@ -0,0 +1,133 @@
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
from enum import Enum
|
||||
from json import dumps as json_dumps
|
||||
from re import escape as regex_escape
|
||||
from typing import Tuple, Union
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.model_executor.guided_decoding.outlines_logits_processors import (
|
||||
CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
class GuidedDecodingMode(Enum):
    """Kinds of guide handled by the outlines backend."""
    # Guide is a JSON schema.
    JSON = "json"
    # Guide is a regular expression.
    REGEX = "regex"
    # Guide is a regular expression built from a list of choices.
    CHOICE = "choice"
    # Guide is a context-free grammar.
    GRAMMAR = "grammar"
|
||||
|
||||
|
||||
# Adapted from
# https://github.com/outlines-dev/outlines/blob/main/outlines/grammars/json.lark
# The main difference is the start rule: it is "object | array" here, so
# scalar values are denied as the root of the JSON. Starting with scalars as
# the root seems to cause llama to generate without stop.
JSON_GRAMMAR = r"""
?start: object | array

?value: object
| array
| UNESCAPED_STRING
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : UNESCAPED_STRING ":" value

%import common.UNESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""

# Shared executor used for generating the logits processor FSM; it is created
# on first use.
global_thread_pool = None
|
||||
|
||||
|
||||
async def get_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
           None]:
    """Build the outlines logits processor without blocking the event loop.

    The guide and its mode are derived from ``guided_params``. Construction
    of the processor is then handed to a module-wide thread pool, created on
    first use with two workers, and awaited.

    Returns:
        The logits processor, or ``None`` when no guide is specified.
    """
    global global_thread_pool
    guide, mode = _get_guide_and_mode(guided_params)
    if not (guide and mode):
        return None

    if global_thread_pool is None:
        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=2)

    event_loop = asyncio.get_running_loop()
    pending_processor = event_loop.run_in_executor(
        global_thread_pool, _get_logits_processor, guide, tokenizer, mode,
        guided_params.whitespace_pattern)
    return await pending_processor
|
||||
|
||||
|
||||
def get_local_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
           None]:
    """Build the outlines logits processor synchronously.

    The guide and its mode are derived from ``guided_params`` and the
    processor is constructed in the calling thread.

    Returns:
        The logits processor, or ``None`` when no guide is specified.
    """
    guide, mode = _get_guide_and_mode(guided_params)
    if guide and mode:
        return _get_logits_processor(guide, tokenizer, mode,
                                     guided_params.whitespace_pattern)
    return None
|
||||
|
||||
|
||||
def _get_guide_and_mode(
|
||||
guided_params: GuidedDecodingParams
|
||||
) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:
|
||||
if guided_params.json:
|
||||
if isinstance(guided_params.json, dict):
|
||||
# turn dict into hashable string
|
||||
json = json_dumps(guided_params.json)
|
||||
else:
|
||||
json = guided_params.json
|
||||
return json, GuidedDecodingMode.JSON
|
||||
elif guided_params.regex:
|
||||
return guided_params.regex, GuidedDecodingMode.REGEX
|
||||
elif guided_params.choice:
|
||||
# choice just uses regex
|
||||
choices = [
|
||||
regex_escape(str(choice)) for choice in guided_params.choice
|
||||
]
|
||||
choices_regex = "(" + "|".join(choices) + ")"
|
||||
return choices_regex, GuidedDecodingMode.CHOICE
|
||||
elif guided_params.grammar:
|
||||
return guided_params.grammar, GuidedDecodingMode.GRAMMAR
|
||||
elif guided_params.json_object:
|
||||
return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
def _get_logits_processor(
    guide: str, tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode,
    whitespace_pattern: Union[str, None]
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
    """Instantiate the logits processor class that matches ``mode``.

    ``whitespace_pattern`` is only passed on in JSON mode. Regex and choice
    modes share the regex processor.

    Raises:
        ValueError: If ``mode`` is not a known guided decoding mode.
    """
    if mode == GuidedDecodingMode.JSON:
        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
    if mode in (GuidedDecodingMode.REGEX, GuidedDecodingMode.CHOICE):
        return RegexLogitsProcessor(guide, tokenizer)
    if mode == GuidedDecodingMode.GRAMMAR:
        return CFGLogitsProcessor(guide, tokenizer)
    raise ValueError(f"Unknown guided decoding mode {mode}")
|
||||
@@ -0,0 +1,222 @@
|
||||
# Copyright 2024- the Outlines developers
|
||||
# This file is adapted from
|
||||
# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from functools import lru_cache
|
||||
from typing import Callable, DefaultDict, Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from lark import Lark
|
||||
from outlines import grammars
|
||||
from outlines.caching import cache
|
||||
from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
|
||||
from outlines.fsm.json_schema import build_regex_from_schema
|
||||
from pydantic import BaseModel
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
|
||||
class BaseLogitsProcessor:
    """Callable that masks logits so only guide-approved tokens remain.

    Guide states are stored in ``_fsm_state``, keyed by the hash of the
    token ids generated so far. A prefix that has not been seen yet starts
    in state 0, the ``defaultdict(int)`` default.
    """

    def __init__(self, guide: Guide):
        self._fsm_state: DefaultDict[int, int] = defaultdict(int)
        self._guide: Guide = guide

    def __call__(self, input_ids: List[int],
                 scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token.

        ``scores`` is modified in place and also returned.

        Raises:
            TypeError: If the guide returns an instruction that is neither
                ``Generate`` nor ``Write``.
        """
        seq_id = hash(tuple(input_ids))

        if len(input_ids) == 0:
            # Note: this is a hack.
            # Lark pickling does not work properly (silent failure),
            # which breaks the RPC (which uses python pickling).
            # We need to find a better solution.
            # On the first time this is called, we simply re-create
            # the Lark object.
            if isinstance(self._guide, CFGGuide):
                self._guide.parser = Lark(
                    self._guide.cfg_string,
                    parser="lalr",
                    lexer="contextual",
                    propagate_positions=False,
                    maybe_placeholders=False,
                    regex=True,
                    import_paths=[grammars.GRAMMAR_PATH],
                )
        else:
            # Advance from the state stored for the prefix without the
            # newest token.
            last_token = input_ids[-1]
            previous_seq_id = hash(tuple(input_ids[:-1]))
            previous_state = self._fsm_state[previous_seq_id]
            self._fsm_state[seq_id] = self._guide.get_next_state(
                state=previous_state, token_id=last_token)

        instruction = self._guide.get_next_instruction(
            state=self._fsm_state[seq_id])

        instruction_type = type(instruction)
        if instruction_type == Generate:  # noqa: E721
            allowed_tokens = instruction.tokens
        elif instruction_type == Write:  # noqa: E721
            # TODO: support fast forward tokens
            allowed_tokens = [instruction.tokens[0]]
        else:
            raise TypeError(
                f"Unsupported instruction type {type(instruction)}")

        vocab_size = scores.shape[-1]
        mask = torch.full((vocab_size, ), -torch.inf, device=scores.device)
        # The tokenizer may support more token ids than the model can generate,
        # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
        # but scores.shape == torch.Size([128256])
        # Using NumPy is faster for filtering token ids
        token_ids = torch.tensor(np.array(allowed_tokens, dtype=np.int64),
                                 device=scores.device)
        token_ids = token_ids.masked_select(token_ids < vocab_size)
        # Allowed positions get 0 and all others stay at -inf, so the
        # in-place add leaves only the allowed tokens finite.
        mask.index_fill_(0, token_ids, 0)
        scores.add_(mask)
        return scores
|
||||
|
||||
|
||||
class RegexLogitsProcessor(BaseLogitsProcessor):
    """Logits processor driven by a regular expression."""

    @classmethod
    @cache()
    def _get_guide(cls, regex_string: str,
                   tokenizer: PreTrainedTokenizerBase) -> Guide:
        """Create the regex guide; wrapped in the outlines ``cache()``."""
        adapted_tokenizer = _adapt_tokenizer(tokenizer)
        return RegexGuide(regex_string, adapted_tokenizer)

    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the regex-structured generation.

        Parameters
        ----------
        regex_string
            A string that represents a regular expression
        tokenizer
            The model's tokenizer

        """
        guide = RegexLogitsProcessor._get_guide(regex_string, tokenizer)
        super().__init__(guide)
|
||||
|
||||
|
||||
class JSONLogitsProcessor(RegexLogitsProcessor):
    """Logits processor driven by a JSON schema converted to a regex."""

    def __init__(self, schema: Union[str, Dict, BaseModel],
                 tokenizer: PreTrainedTokenizerBase,
                 whitespace_pattern: Union[str, None]):
        """Compile the FSM that drives the JSON-guided generation.

        Parameters
        ----------
        schema
            A JSON schema that encodes the structure we want the model to
            generate. It may be a Pydantic model class, a dictionary or a
            string containing the schema.
        tokenizer
            The model's tokenizer
        whitespace_pattern
            Pattern to use for JSON syntactic whitespace (doesn't impact
            string literals), for example one that allows only a single
            space or newline.

        Raises
        ------
        ValueError
            If the schema is none of the supported kinds.

        """
        if isinstance(schema, str):
            schema_str = schema
        elif isinstance(schema, type(BaseModel)):
            # A Pydantic model class: export its JSON schema first.
            schema_str = json.dumps(schema.model_json_schema())
        elif isinstance(schema, Dict):
            schema_str = json.dumps(schema)
        else:
            raise ValueError(
                f"Cannot parse schema {schema}. The schema must be either "
                f"a Pydantic object, a dictionary or a string that contains "
                f"the JSON Schema specification")

        super().__init__(
            build_regex_from_schema(schema_str, whitespace_pattern),
            tokenizer)
|
||||
|
||||
|
||||
class CFGLogitsProcessor(BaseLogitsProcessor):
    """Logits processor driven by a context-free grammar."""

    @classmethod
    @cache()
    def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
        """Create the CFG guide; wrapped in the outlines ``cache()``."""
        adapted_tokenizer = _adapt_tokenizer(tokenizer)
        return CFGGuide(cfg, adapted_tokenizer)

    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the context free grammar generation.

        Parameters
        ----------
        cfg
            A string that represents a context-free grammar
        tokenizer
            The model's tokenizer

        """
        guide = CFGLogitsProcessor._get_guide(cfg, tokenizer)
        super().__init__(guide)
        # Replace the guide from the cached lookup with a copy of it.
        self._guide = self._guide.copy()
|
||||
|
||||
|
||||
@lru_cache(maxsize=32)
|
||||
def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
|
||||
"""Adapt vLLM's tokenizer to use to compile the FSM.
|
||||
|
||||
The API of Outlines tokenizers is slightly different to that of
|
||||
`transformers`. The decoder of outlines, returns a list whereas
|
||||
the decode of vLLM returns an str. To sync the vLLM decoder with
|
||||
outlines internal api, the decoder should be adapted. In addition
|
||||
we need to handle the missing spaces to Llama's tokenizer to be
|
||||
able to compile FSMs for this model.
|
||||
|
||||
"""
|
||||
if getattr(tokenizer, "_outlines_adapted", False):
|
||||
return tokenizer
|
||||
|
||||
tokenizer = copy.deepcopy(tokenizer)
|
||||
|
||||
tokenizer.vocabulary = tokenizer.get_vocab()
|
||||
tokenizer.special_tokens = set(tokenizer.all_special_tokens)
|
||||
|
||||
def convert_token_to_string(token: str) -> str:
|
||||
from transformers.file_utils import SPIECE_UNDERLINE
|
||||
|
||||
string = tokenizer.convert_tokens_to_string([token])
|
||||
|
||||
# A hack to handle missing spaces to HF's Llama tokenizers
|
||||
if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
|
||||
return " " + string
|
||||
|
||||
return string
|
||||
|
||||
def change_decoder(
|
||||
decoder: Callable[[List[int]],
|
||||
str]) -> Callable[[List[int]], List[str]]:
|
||||
"""Sync vLLM's decoder with the outlines by returning list."""
|
||||
|
||||
def new_decoder(inp_tokens: List[int]) -> List[str]:
|
||||
return [decoder(inp_tokens)]
|
||||
|
||||
return new_decoder
|
||||
|
||||
tokenizer.convert_token_to_string = convert_token_to_string
|
||||
tokenizer.decode = change_decoder(tokenizer.decode)
|
||||
setattr(tokenizer, "_outlines_adapted", True) # noqa: B010
|
||||
|
||||
return tokenizer
|
||||
Reference in New Issue
Block a user