"""vLLM: a high-throughput and memory-efficient inference engine for LLMs""" import os os.environ['PYTORCH_CNDEV_BASED_MLU_CHECK'] = '1' os.environ['CN_NOTIFIER_POOL_MAX'] = "1000" from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.llm import LLM from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.model_executor.models import ModelRegistry from vllm.outputs import (CompletionOutput, EmbeddingOutput, EmbeddingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from .version import (__version__, __version_tuple__, __vllm_mlu_version__, __torch_version__) from vllm.platforms import current_platform if current_platform.is_mlu(): try: import vllm_mlu print("\033[0;32mApply vllm_mlu success, running in performance version !\033[0m") except ModuleNotFoundError: print("\033[0;31mApply vllm_mlu failed, running in basic version !\033[0m") except Exception as e: print("\033[0;31mApply vllm_mlu failed!\033[0m") raise Exception(e) __version__ = f"{__version__}+mlu{__vllm_mlu_version__}.pt{__torch_version__}" __all__ = [ "__version__", "__version_tuple__", "LLM", "ModelRegistry", "PromptType", "TextPrompt", "TokensPrompt", "SamplingParams", "RequestOutput", "CompletionOutput", "EmbeddingOutput", "EmbeddingRequestOutput", "LLMEngine", "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs", "initialize_ray_cluster", "PoolingParams", ]