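"""Out-of-tree vLLM platform implementation for the VACC device."""
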
import os
from typing import TYPE_CHECKING, Optional

import torch

from vllm.logger import init_logger
from vllm.platforms.interface import Platform, PlatformEnum, _Backend

if TYPE_CHECKING:
    from vllm.config import VllmConfig, ModelConfig
else:
    VllmConfig = None
    ModelConfig = None

logger = init_logger(__name__)


class VaccPlatform(Platform):
    """vLLM Platform implementation for the out-of-tree VACC device."""

    # The platform is unusable without the torch_vacc extension, so fail
    # loudly if it cannot be imported.
    try:
        import torch_vacc  # noqa: F401
        is_vacc = True
    except Exception as e:
        raise ImportError(f"Failed to import torch_vacc: {e}") from e

    _enum = PlatformEnum.OOT
    device_name: str = "vacc"
    device_type: str = "vacc"
    dispatch_key: str = "PrivateUse1"
    ray_device_key: str = "GPU"
    device_control_env_var: str = "VACC_VISIBLE_MODULES"
    simple_compile_backend: str = "eager"  # Disable torch.compile()

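    # Attention backend selection hook: vLLM resolves the fully qualified
    # backend class from the string returned here, picking the MLA or the
    # standard attention implementation for the v0 or v1 engine.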
    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool, use_mla: bool,
                             has_sink: bool, use_sparse: bool) -> str:
        if use_mla:
            logger.info("Using VACCMLA backend.")
            if use_v1:
                return "vllm_vacc.vllm.v1.attention.backends.vacc_mla.VACCMLABackend"
            return "vllm_vacc.vllm.attention.backends.vacc_mla.VACCMLABackend"
        logger.info("Using VACCAttention backend.")
        if use_v1:
            return "vllm_vacc.vllm.v1.attention.backends.vacc_attn.VACCAttentionBackend"
        return "vllm_vacc.vllm.attention.backends.vacc_attn.VACCAttentionBackend"

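    # Async output processing is enabled unconditionally; inference_mode()
    # falls back to torch.no_grad(), presumably because torch.inference_mode()
    # is not available for this out-of-tree device.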
    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return True

    @staticmethod
    def inference_mode():
        return torch.no_grad()

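    # Engine configuration hook: called once while the engine is being built
    # so the platform can reject unsupported features and fill in
    # VACC-specific defaults (worker class, KV-cache block size,
    # multiprocessing start method).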
    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        import vllm.envs as envs

        if vllm_config.kv_transfer_config:
            raise NotImplementedError(
                "kv-transfer-config is not implemented for VACC")

        cache_config = vllm_config.cache_config
        scheduler_config = vllm_config.scheduler_config
        if ((scheduler_config.chunked_prefill_enabled
             or cache_config.enable_prefix_caching)
                and cache_config.cache_dtype != "auto"):
            raise RuntimeError("Chunked-prefill and prefix-cache on the VACC "
                               "backend are not compatible with FP8 KV cache.")

        # scheduling_policy = scheduler_config.policy
        # model_config = vllm_config.model_config
        # use_async_output_proc = model_config.use_async_output_proc
        # if scheduling_policy == "priority" and use_async_output_proc:  # probably a bug
        #     logger.warning("Scheduling policy 'priority' is not fully supported "
        #                    "on VACC, falling back to 'fcfs' automatically.")
        #     vllm_config.scheduler_config.policy = "fcfs"

        # if vllm_config.speculative_config is not None:
        #     raise NotImplementedError(
        #         "Speculative decoding is not implemented for VACC")

        parallel_config = vllm_config.parallel_config
        if parallel_config.worker_cls == "auto":
            if vllm_config.speculative_config:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm_vacc.vllm.v1.worker.vacc_worker.VACCWorker"
                else:
                    parallel_config.worker_cls = \
                        "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                    parallel_config.sd_worker_cls = \
                        "vllm_vacc.vllm.worker.vacc_worker.VACCWorker"
            else:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm_vacc.vllm.v1.worker.vacc_worker.VACCWorker"
                    logger.debug("Using v1 VACCWorker")
                else:
                    parallel_config.worker_cls = \
                        "vllm_vacc.vllm.worker.vacc_worker.VACCWorker"

        # gpu_memory_utilization is not supported on this backend; the default
        # KV-cache block size on VACC is 16.
        if cache_config and cache_config.gpu_memory_utilization:
            logger.warning("gpu_memory_utilization is not supported on VACC.")

        # if cache_config and cache_config.enable_prefix_caching:
        #     raise NotImplementedError(
        #         "Prefix-caching is not implemented for VACC")

        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

        if (parallel_config.distributed_executor_backend == 'mp'
                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
                              None) is not None:
                logger.warning("On VACC, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Using "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                               "as it was explicitly requested.")
            else:
                logger.warning(
                    "On VACC, VLLM_WORKER_MULTIPROC_METHOD=fork "
                    "might cause application hangs on exit. Setting "
                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                    "To override that behavior, please set "
                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    @classmethod
    def is_pin_memory_available(cls):
        logger.warning("Pin memory is not supported on VACC.")
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        return "vllm_vacc.vllm.lora.punica_wrapper.punica_vacc.PunicaWrapperVACC"

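    # Memory accounting goes through the torch.vacc namespace, which mirrors
    # the torch.cuda memory-stats API (reset_peak_memory_stats /
    # max_memory_allocated) for the vacc device.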
    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        torch.vacc.reset_peak_memory_stats(device)
        return torch.vacc.max_memory_allocated(device)

    @classmethod
    def use_all_gather(cls) -> bool:
        """
        Whether to use allgather in LogitsProcessor to gather the logits.
        """
        return True

    @classmethod
    def supports_v1(cls, model_config: "ModelConfig") -> bool:
        """Returns whether the current platform can support v1 for the supplied
        model configuration.
        """
        # Set VLLM_USE_V1=0 to force the v0 engine.
        if os.getenv("VLLM_USE_V1", "1") == "0":
            return False
        return True
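

# A minimal sketch of how a platform like this is typically exposed to vLLM
# through the "vllm.platform_plugins" entry-point group. The module path used
# below is an assumption for illustration; adjust it to the actual location of
# VaccPlatform inside the vllm_vacc package.
#
#     # pyproject.toml:
#     #   [project.entry-points."vllm.platform_plugins"]
#     #   vacc = "vllm_vacc:register"
#
#     def register() -> str:
#         # Return the fully qualified class name; vLLM imports it lazily.
#         return "vllm_vacc.vllm.platform.VaccPlatform"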