Iluvatar-mrv100 SDK 4.3.0
vllm/transformers_utils/tokenizer_group/__init__.py (new normal file, 56 lines)
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Type

from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig, TokenizerPoolConfig)
from vllm.executor.ray_utils import ray

from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
from .tokenizer_group import TokenizerGroup

if ray:
    from .ray_tokenizer_group import RayTokenizerGroupPool
else:
    RayTokenizerGroupPool = None  # type: ignore


def init_tokenizer_from_configs(model_config: ModelConfig,
                                scheduler_config: SchedulerConfig,
                                parallel_config: ParallelConfig,
                                lora_config: Optional[LoRAConfig]):
    init_kwargs = dict(tokenizer_id=model_config.tokenizer,
                       enable_lora=bool(lora_config),
                       max_num_seqs=scheduler_config.max_num_seqs,
                       max_loras=lora_config.max_loras if lora_config else 0,
                       max_input_length=None,
                       tokenizer_mode=model_config.tokenizer_mode,
                       trust_remote_code=model_config.trust_remote_code,
                       revision=model_config.tokenizer_revision,
                       truncation_side=model_config.truncation_side)

    return get_tokenizer_group(parallel_config.tokenizer_pool_config,
                               **init_kwargs)


def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
                        **init_kwargs) -> BaseTokenizerGroup:
    tokenizer_cls: Type[BaseTokenizerGroup]
    if tokenizer_pool_config is None:
        tokenizer_cls = TokenizerGroup
    elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass(
            tokenizer_pool_config.pool_type, BaseTokenizerGroup):
        tokenizer_cls = tokenizer_pool_config.pool_type
    elif tokenizer_pool_config.pool_type == "ray":
        if RayTokenizerGroupPool is None:
            raise ImportError(
                "RayTokenizerGroupPool is not available. Please install "
                "the ray package to use the Ray tokenizer group pool.")
        tokenizer_cls = RayTokenizerGroupPool
    else:
        raise ValueError(
            f"Unknown pool type: {tokenizer_pool_config.pool_type}")
    return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs)


__all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"]
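A hedged usage sketch (not part of this commit): with no tokenizer pool configured, get_tokenizer_group falls back to the plain in-process TokenizerGroup; the model id below is a placeholder.

from vllm.transformers_utils.tokenizer_group import get_tokenizer_group

# No pool config -> plain in-process TokenizerGroup.
tokenizer_group = get_tokenizer_group(
    tokenizer_pool_config=None,
    tokenizer_id="facebook/opt-125m",  # placeholder model/tokenizer id
    enable_lora=False,
    max_num_seqs=256,
    max_input_length=None)
token_ids = tokenizer_group.encode("Hello, world!")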
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py (new normal file, 68 lines)
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from typing import List, Optional

from vllm.config import TokenizerPoolConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import AnyTokenizer


class BaseTokenizerGroup(ABC):
    """A group of tokenizers that can be used for LoRA adapters."""

    @classmethod
    @abstractmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "BaseTokenizerGroup":
        pass

    @abstractmethod
    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        pass

    @abstractmethod
    def get_max_input_len(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        pass

    @abstractmethod
    def encode(self,
               prompt: str,
               lora_request: Optional[LoRARequest] = None,
               add_special_tokens: Optional[bool] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    async def encode_async(
            self,
            prompt: str,
            lora_request: Optional[LoRARequest] = None,
            add_special_tokens: Optional[bool] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        """Get a tokenizer for a LoRA request."""
        pass

    @abstractmethod
    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        """Get a tokenizer for a LoRA request."""
        pass

    def check_health(self):
        """Raise exception if the tokenizer group is unhealthy."""
        return
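As a hedged illustration (not part of the commit), a minimal concrete subclass only needs to fill in the abstract methods above; the whitespace-splitting class below is invented for this example and is not a real vLLM tokenizer.

from typing import List, Optional

from vllm.config import TokenizerPoolConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
    BaseTokenizerGroup)


class WhitespaceTokenizerGroup(BaseTokenizerGroup):
    """Toy implementation: one fake "token id" per whitespace-separated word."""

    @classmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "WhitespaceTokenizerGroup":
        return cls()

    def ping(self) -> bool:
        return True

    def get_max_input_len(
            self,
            lora_request: Optional[LoRARequest] = None) -> Optional[int]:
        return None

    def encode(self,
               prompt: str,
               lora_request: Optional[LoRARequest] = None,
               add_special_tokens: Optional[bool] = None) -> List[int]:
        return [len(word) for word in prompt.split()]

    async def encode_async(self,
                           prompt: str,
                           lora_request: Optional[LoRARequest] = None,
                           add_special_tokens: Optional[bool] = None
                           ) -> List[int]:
        return self.encode(prompt, lora_request, add_special_tokens)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None) -> AnyTokenizer:
        raise NotImplementedError("toy example has no HF tokenizer")

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None) -> AnyTokenizer:
        raise NotImplementedError("toy example has no HF tokenizer")

Per get_tokenizer_group above, such a class could also be passed directly as tokenizer_pool_config.pool_type.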
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py (new normal file, 244 lines)
@@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0

import asyncio
import os
from typing import List, Optional

try:
    from ray.exceptions import ActorDiedError  # type: ignore
except ImportError:
    # For older versions of Ray
    from ray.exceptions import RayActorError as ActorDiedError  # type: ignore
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

from vllm.config import TokenizerPoolConfig
from vllm.executor.ray_utils import ray
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .base_tokenizer_group import BaseTokenizerGroup
from .tokenizer_group import TokenizerGroup

logger = init_logger(__name__)


class RayTokenizerGroupPool(BaseTokenizerGroup):
    """A Ray-based pool of TokenizerGroups for async tokenization."""

    # Class to use for workers making up the pool.
    _worker_cls = TokenizerGroup

    @classmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "RayTokenizerGroupPool":
        if not tokenizer_pool_config:
            raise ValueError("tokenizer_pool_config must not be None.")
        ray_actor_options = (tokenizer_pool_config.extra_config or {
            "num_cpus": 0
        })
        ray_actor_options.setdefault(
            "scheduling_strategy",
            NodeAffinitySchedulingStrategy(
                node_id=ray.get_runtime_context().get_node_id(), soft=True))

        # Carry over the env vars to the actors.
        # This is necessary for API keys and such.
        ray_actor_options.setdefault("runtime_env", {})
        _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])

        init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
        init_kwargs["ray_actor_options"] = ray_actor_options

        return cls(**init_kwargs)

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], num_actors: int,
                 ray_actor_options: dict, **tokenizer_config):
        # Store a local copy of the TokenizerGroup for quick access
        # to underlying HF tokenizers.
        self._tokenizer_config = {
            "tokenizer_id": tokenizer_id,
            "enable_lora": enable_lora,
            "max_num_seqs": max_num_seqs,
            "max_input_length": max_input_length,
            **tokenizer_config
        }
        self._local_tokenizer_group = self._worker_cls(
            **self._tokenizer_config, )

        self._ray_tokenizer_group_cls = ray.remote(
            self._worker_cls).options(**ray_actor_options)  # type: ignore
        self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)]
        self._idle_actors: Optional[asyncio.Queue] = None

        # If set, actor is unhealthy. Will reraise on the next
        # check_health call.
        self._exception: Optional[ActorDiedError] = None

    def _init_actor(self) -> ray.ObjectRef:
        return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config)

    @property
    def pool_size(self) -> int:
        return len(self.tokenizer_actors)

    def ping(self):
        return ray.get([
            actor.ping.remote()  # type: ignore
            for actor in self.tokenizer_actors
        ])

    def _ensure_queue_initialized(self):
        if self._idle_actors is None:
            self._idle_actors = asyncio.Queue()
            for actor in self.tokenizer_actors:
                self._idle_actors.put_nowait(actor)

    def _finalize_encode(self, actor: ray.ObjectRef,
                         original_actor: ray.ObjectRef, actor_is_alive: bool):
        assert self._idle_actors is not None
        # Cleanup the dead actor.
        if not actor_is_alive or original_actor is not actor:
            self.tokenizer_actors.remove(original_actor)
        if actor_is_alive:
            # Put the actor back in the queue.
            # This is done in a finally block to ensure that the actor is
            # always put back in the queue, even if an exception/cancellation
            # is raised.
            self._idle_actors.put_nowait(actor)
            # Add back the new actor.
            if original_actor is not actor:
                self.tokenizer_actors.append(actor)

    def encode(self,
               prompt: str,
               lora_request: Optional[LoRARequest] = None,
               add_special_tokens: Optional[bool] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        The actor is then put back in the queue for future use.
        This is blocking.
        """
        self.check_health()
        self._ensure_queue_initialized()
        assert self._idle_actors is not None

        if self._idle_actors.empty():
            raise RuntimeError("No idle actors available.")
        actor = self._idle_actors.get_nowait()
        actor_is_alive = True
        original_actor = actor
        try:
            ret = ray.get(
                actor.encode.remote(prompt=prompt,
                                    lora_request=lora_request,
                                    add_special_tokens=add_special_tokens))
        except ActorDiedError as e:
            # If the actor is dead, we first try to reinitialize it.
            logger.warning("%s died with ActorDiedError, reinitializing.",
                           actor,
                           exc_info=e)
            actor = self._init_actor()
            try:
                ret = ray.get(
                    actor.encode.remote(prompt=prompt,
                                        lora_request=lora_request,
                                        add_special_tokens=add_special_tokens))
            except ActorDiedError as e:
                logger.error(
                    "%s died for second time in a row, marking "
                    "RayTokenizerGroupPool as unhealthy.", actor)
                actor_is_alive = False
                if not self._exception:
                    self._exception = e
                self.check_health()
        finally:
            self._finalize_encode(actor, original_actor, actor_is_alive)
        return ret

    async def encode_async(
            self,
            prompt: str,
            lora_request: Optional[LoRARequest] = None,
            add_special_tokens: Optional[bool] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        If there are no idle actors, we wait until one becomes
        available.
        The actor is then put back in the queue for future use.
        This is non-blocking.
        """
        self.check_health()
        self._ensure_queue_initialized()
        assert self._idle_actors is not None

        actor = await self._idle_actors.get()
        actor_is_alive = True
        original_actor = actor
        try:
            ret = await actor.encode.remote(
                prompt=prompt,
                lora_request=lora_request,
                add_special_tokens=add_special_tokens)
        except ActorDiedError as e:
            # If the actor is dead, we first try to reinitialize it.
            logger.warning("%s died with ActorDiedError, reinitializing.",
                           actor,
                           exc_info=e)
            actor = self._init_actor()
            try:
                ret = await actor.encode.remote(
                    prompt=prompt,
                    lora_request=lora_request,
                    add_special_tokens=add_special_tokens)
            except ActorDiedError as e:
                logger.error(
                    "%s died for second time in a row, marking "
                    "RayTokenizerGroupPool as unhealthy.", actor)
                actor_is_alive = False
                if not self._exception:
                    self._exception = e
                self.check_health()
        finally:
            self._finalize_encode(actor, original_actor, actor_is_alive)
        return ret

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self._local_tokenizer_group.get_max_input_len(lora_request)

    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return self._local_tokenizer_group.get_lora_tokenizer(lora_request)

    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return await self._local_tokenizer_group.get_lora_tokenizer_async(
            lora_request)

    def check_health(self):
        if self._exception:
            raise RuntimeError(
                "TokenizerGroupPool is unhealthy.") from self._exception


def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
    """Copy over all current process environment variables to the runtime_env.

    The variables in runtime_env will take precedence over the current process
    environment variables.

    runtime_env will be modified in place."""
    env_vars = os.environ.copy()
    runtime_env.setdefault("env_vars", {})
    env_vars.update(runtime_env["env_vars"])
    runtime_env["env_vars"] = env_vars
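A hedged sketch (not part of this commit) of routing through the Ray pool. It assumes TokenizerPoolConfig exposes the pool_size, pool_type, and extra_config attributes read by from_config above, and uses a placeholder model id.

from vllm.config import TokenizerPoolConfig
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group

# pool_type="ray" selects RayTokenizerGroupPool, which spawns pool_size
# TokenizerGroup actors pinned (softly) to the local node.
pool_config = TokenizerPoolConfig(pool_size=2, pool_type="ray",
                                  extra_config={})
tokenizer_pool = get_tokenizer_group(
    pool_config,
    tokenizer_id="facebook/opt-125m",  # placeholder
    enable_lora=False,
    max_num_seqs=256,
    max_input_length=None)
token_ids = tokenizer_pool.encode("Hello, world!")  # blocking path

encode_async is the natural entry point from async code, since it waits for an idle actor instead of raising when none is free, as encode does.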
vllm/transformers_utils/tokenizer_group/tokenizer_group.py (new normal file, 106 lines)
@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0

from typing import List, Optional

from vllm.config import TokenizerPoolConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
                                               get_lora_tokenizer,
                                               get_lora_tokenizer_async,
                                               get_tokenizer)
from vllm.utils import LRUCache

from .base_tokenizer_group import BaseTokenizerGroup


class TokenizerGroup(BaseTokenizerGroup):
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], **tokenizer_config):
        self.tokenizer_id = tokenizer_id
        self.tokenizer_config = tokenizer_config
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
        max_loras = tokenizer_config.get("max_loras", 0)
        self.lora_tokenizers = LRUCache[int, AnyTokenizer](
            capacity=max(max_loras, max_num_seqs) if enable_lora else 0)

    @classmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "TokenizerGroup":
        return cls(**init_kwargs)

    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        return True

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self.max_input_length

    def _raise_if_input_too_long(self,
                                 encoded_tokens: List[int],
                                 lora_request: Optional[LoRARequest] = None):
        input_length = len(encoded_tokens)
        if lora_request:
            max_input_length = (lora_request.long_lora_max_len
                                or self.max_input_length)
        else:
            max_input_length = self.max_input_length
        if max_input_length is not None and input_length > max_input_length:
            raise ValueError("Input too long.", input_length, max_input_length)

    def encode(self,
               prompt: str,
               lora_request: Optional[LoRARequest] = None,
               add_special_tokens: Optional[bool] = None) -> List[int]:
        tokenizer = self.get_lora_tokenizer(lora_request)
        ret = encode_tokens(tokenizer,
                            prompt,
                            add_special_tokens=add_special_tokens)
        self._raise_if_input_too_long(ret, lora_request)
        return ret

    async def encode_async(
            self,
            prompt: str,
            lora_request: Optional[LoRARequest] = None,
            add_special_tokens: Optional[bool] = None) -> List[int]:
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        ret = encode_tokens(tokenizer,
                            prompt,
                            add_special_tokens=add_special_tokens)
        self._raise_if_input_too_long(ret, lora_request)
        return ret

    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            tokenizer = (get_lora_tokenizer(
                lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers[lora_request.lora_int_id]

    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            tokenizer = (await get_lora_tokenizer_async(
                lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers[lora_request.lora_int_id]
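A closing sketch (not part of this commit) of the input-length guard in TokenizerGroup; the model id is a placeholder and the limit of 8 tokens is chosen only to trigger the error quickly.

from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
    TokenizerGroup)

group = TokenizerGroup(tokenizer_id="facebook/opt-125m",  # placeholder
                       enable_lora=False,
                       max_num_seqs=256,
                       max_input_length=8)

group.encode("short prompt")  # within the 8-token limit
try:
    group.encode("a deliberately much longer prompt " * 50)
except ValueError as err:
    print(err)  # ('Input too long.', <input_length>, 8)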