49 lines
2.1 KiB
Python
49 lines
2.1 KiB
Python
from vllm.engine.arg_utils import EngineArgs
|
|
from vllm.usage.usage_lib import UsageContext
|
|
from typing import Dict, Optional
|
|
from vllm.engine.metrics_types import StatLoggerBase
|
|
|
|
class LLMEngine:
    """Engine factory that validates speculative settings, then delegates.

    The single classmethod builds the vLLM config from the engine arguments,
    rejects unsupported speculative-decoding settings, records the model
    info with the vllm_vacc config manager, and finally hands construction
    over to the V1 or default vLLM ``LLMEngine`` implementation.
    """

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    ) -> "LLMEngine":
        """Build an LLM engine from ``engine_args``.

        Args:
            engine_args: Source of the engine configuration (via
                ``create_engine_config``) and of ``disable_log_stats``.
            usage_context: Forwarded to ``create_engine_config`` and to the
                selected engine's ``from_vllm_config``.
            stat_loggers: Forwarded unchanged to ``from_vllm_config``.

        Returns:
            Whatever ``from_vllm_config`` of the selected engine class
            returns. Note that ``cls`` itself is not used for construction.

        Raises:
            ValueError: If the config carries a ``speculative_config`` that
                has a ``num_speculative_tokens`` attribute different from 1.
        """
        config = engine_args.create_engine_config(usage_context)

        # A private sentinel separates "attribute is absent" from any real
        # value (including None). Being a plain ``object()``, it has neither
        # of the attributes probed below, so the lookups can be chained
        # safely even when ``speculative_config`` itself is absent.
        missing = object()
        spec_config = getattr(config, 'speculative_config', missing)

        # Patch: only num_speculative_tokens == 1 is supported.
        num_tokens = getattr(spec_config, 'num_speculative_tokens', missing)
        if num_tokens is not missing and num_tokens != 1:
            raise ValueError(f'run_mp_engine: only support num_speculative_tokens == 1, but get {num_tokens}')  # noqa: E501

        # Use the speculative method as the model-info key when one is
        # exposed; otherwise fall back to the literal "default".
        model_infos = getattr(spec_config, 'method', "default")

        # Imported lazily, after validation, so an invalid config fails
        # before the config manager is touched.
        from vllm_vacc.vllm.config_manager import vllm_vacc_config_manager
        vllm_vacc_config_manager().update_model_infos(model_infos)

        import vllm.envs as envs
        if envs.VLLM_USE_V1:
            from vllm.v1.engine.llm_engine import LLMEngine as engine_cls
        else:
            from vllm.engine.llm_engine import LLMEngine as engine_cls

        assert engine_cls is not None, f"LLMEngine is empty: {engine_cls}"

        return engine_cls.from_vllm_config(
            vllm_config=config,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
            disable_log_stats=engine_args.disable_log_stats,
        )