# Forked from EngineX-Cambricon/enginex-mlu370-vllm.
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
|
|
|
|
import os
|
|
# Environment switches exported at import time, before any vllm submodule is
# imported further down in this file.
# NOTE(review): presumably both are read by the Cambricon MLU runtime stack --
# confirm their exact meaning against the vendor documentation.
os.environ.update({
    "PYTORCH_CNDEV_BASED_MLU_CHECK": "1",
    "CN_NOTIFIER_POOL_MAX": "1000",
})
|
|
|
|
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
|
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
|
from vllm.engine.llm_engine import LLMEngine
|
|
from vllm.entrypoints.llm import LLM
|
|
from vllm.executor.ray_utils import initialize_ray_cluster
|
|
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
|
|
from vllm.model_executor.models import ModelRegistry
|
|
from vllm.outputs import (CompletionOutput, EmbeddingOutput,
|
|
EmbeddingRequestOutput, RequestOutput)
|
|
from vllm.pooling_params import PoolingParams
|
|
from vllm.sampling_params import SamplingParams
|
|
|
|
from .version import (__version__, __version_tuple__,
|
|
__vllm_mlu_version__, __torch_version__)
|
|
|
|
|
|
from vllm.platforms import current_platform
|
|
|
|
# On an MLU platform, try to import the optional ``vllm_mlu`` package and
# report on stdout whether it could be applied.
if current_platform.is_mlu():
    try:
        # Imported only for its import-time effects; the module object itself
        # is not used in this file.
        import vllm_mlu  # noqa: F401
    except ModuleNotFoundError:
        # Best effort: without the package, continue in the "basic version".
        # NOTE(review): this branch is also taken when vllm_mlu is installed
        # but one of the modules it imports is missing.
        print("\033[0;31mApply vllm_mlu failed, running in basic version !\033[0m")
    except Exception:
        print("\033[0;31mApply vllm_mlu failed!\033[0m")
        # Bare re-raise keeps the original exception type and traceback
        # instead of wrapping the failure in a generic ``Exception``.
        raise
    else:
        # Reached only when the import itself succeeded.
        print("\033[0;32mApply vllm_mlu success, running in performance version !\033[0m")
|
|
|
|
|
|
# Rebind the ``__version__`` imported from ``.version`` so that it also
# carries ``__vllm_mlu_version__`` and ``__torch_version__``, in the form
# "<base>+mlu<vllm_mlu_version>.pt<torch_version>".
# ``__version_tuple__`` is left exactly as imported.
__version__ = f"{__version__}+mlu{__vllm_mlu_version__}.pt{__torch_version__}"
|
|
|
|
|
|
# Public names exported by ``from vllm import *``. The entries are grouped by
# the module each one is imported from at the top of this file.
__all__ = [
    # .version
    "__version__", "__version_tuple__",
    # vllm.entrypoints.llm, vllm.model_executor.models
    "LLM", "ModelRegistry",
    # vllm.inputs
    "PromptType", "TextPrompt", "TokensPrompt",
    # vllm.sampling_params
    "SamplingParams",
    # vllm.outputs
    "RequestOutput", "CompletionOutput",
    "EmbeddingOutput", "EmbeddingRequestOutput",
    # vllm.engine.*
    "LLMEngine", "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs",
    # vllm.executor.ray_utils
    "initialize_ray_cluster",
    # vllm.pooling_params
    "PoolingParams",
]
|