# enginex-vastai-va16-vllm/vllm_vacc/vllm/v1/worker/vacc_worker.py
"""A VACC worker class."""
import gc
import os
from contextlib import nullcontext
from importlib import util
from typing import Optional, Union

import torch

from vllm import envs
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         VllmConfig)
# Sleep-mode allocator; import path follows upstream vLLM and is assumed to
# be present in this tree (it is referenced by initialize_from_config below).
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment,
                              set_custom_all_reduce)
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from vllm.logger import init_logger
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import IntermediateTensors
from vllm.utils import (GiB_bytes, STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType,
                        get_dtype_size)
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import AsyncModelRunnerOutput, ModelRunnerOutput
from vllm.v1.worker.gpu_worker import Worker

from vllm_vacc.vllm.model_executor.models.vars import \
    BLOCK_GROUP_SIZE as env_blk_grp_size
from vllm_vacc.vllm.v1.worker.vacc_model_runner import VACCModelRunner

logger = init_logger(__name__)

TP_GROUP_ID = 1234


def generate_rank_info_list():
    # Collect per-rank device info for the TP group and register the
    # deterministic group id produced by generate_tp_group_id().
    get_tp_group().generate_rank_device_infos()
    get_tp_group().generate_group_id(TP_GROUP_ID)


def generate_tp_group_id():
    """Derive a deterministic signed 32-bit TP group id from .bootinfos."""
    global TP_GROUP_ID
    import uuid
    from pathlib import Path
    bootinfo_config = f'{Path.cwd()}/.bootinfos'
    current_bootinfos = "default"
    if os.path.exists(bootinfo_config):
        try:
            with open(bootinfo_config) as f:
                current_bootinfos = f.readline()
        except Exception as e:
            logger.warning("bootinfo load failed: %s", e)
    # Hash the bootinfo string into a 128-bit UUID, then fold it into a
    # signed int32 so every rank derives the same group id.
    unique_value = uuid.uuid5(uuid.NAMESPACE_URL, current_bootinfos).int
    int32_value = unique_value & 0xFFFFFFFF
    if int32_value >= 2**31:
        int32_value -= 2**32
    TP_GROUP_ID = int32_value


def init_worker_distributed_environment(
    vllm_config: VllmConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    local_rank: int = -1,
    backend: str = "vccl",
) -> None:
    """Initialize the distributed environment."""
    parallel_config = vllm_config.parallel_config
    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
    init_distributed_environment(parallel_config.world_size, rank,
                                 distributed_init_method, local_rank,
                                 backend)
    ensure_model_parallel_initialized(
        parallel_config.tensor_parallel_size,
        parallel_config.pipeline_parallel_size,
        parallel_config.decode_context_parallel_size)
    ensure_kv_transfer_initialized(vllm_config)
    generate_tp_group_id()
    generate_rank_info_list()


def get_cache_block_size(
    cache_config: CacheConfig,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
) -> int:
    head_size = model_config.get_head_size()
    num_heads = model_config.get_num_kv_heads(parallel_config)
    num_attention_layers = model_config.get_num_layers_by_block_type(
        parallel_config, LayerBlockType.attention)

    if cache_config.cache_dtype == "auto":
        dtype = model_config.dtype
    else:
        dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]

    key_cache_entry = num_heads * head_size
    # For MLA there is no value cache, since the latent vector
    # holds the joint keys and values.
    value_cache_entry = key_cache_entry if not model_config.use_mla else 0
    total = num_attention_layers * cache_config.block_size * \
        (key_cache_entry + value_cache_entry)
    dtype_size = get_dtype_size(dtype)
    return dtype_size * total
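

# Worked example (illustrative numbers, not from any real config): for a
# Llama-style model with 32 attention layers, 8 KV heads of head_size 128,
# block_size 16 and an fp16 cache dtype, one cache block costs
#   32 layers * 16 tokens * (8*128 key + 8*128 value) elems * 2 bytes = 2 MiB.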


class VACCWorker(Worker):

    def __init__(self,
                 vllm_config: VllmConfig,
                 local_rank: int,
                 rank: int,
                 distributed_init_method: str,
                 is_driver_worker: bool = False):
        super().__init__(vllm_config,
                         local_rank,
                         rank,
                         distributed_init_method,
                         is_driver_worker=is_driver_worker)
        # Custom all-reduce is a CUDA-specific fast path; force it off on VACC.
        self.parallel_config.disable_custom_all_reduce = True

    def init_device(self) -> None:
        if self.device_config.device.type == "vacc":
            try:
                self.device = torch.device(f"vacc:{self.local_rank}")
                torch.vacc.set_device(self.device)
                gc.collect()
                torch.vacc.empty_cache()
            except Exception as e:
                raise RuntimeError(
                    f"device init failed: {e}, self.device: {self.device}, "
                    "check /dev/* or VACC_VISIBLE_DEVICES")
        else:
            raise RuntimeError(
                f"Unsupported device type: {self.device_config.device}")

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config, self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)
        # Construct the model runner.
        self.model_runner: VACCModelRunner = VACCModelRunner(
            self.vllm_config, self.device)

    def sleep(self, level: int = 1) -> None:
        logger.warning("Sleep mode is not supported on VACC; ignoring.")

    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        logger.warning("Sleep mode is not supported on VACC; ignoring.")

    def get_cache_block_size_bytes(self) -> int:
        """Return the size of a single KV cache block in bytes."""
        return get_cache_block_size(self.cache_config, self.model_config,
                                    self.parallel_config)

    def determine_available_memory(self) -> int:
        """Return the number of bytes available for the KV cache.

        Delegates to determine_available_memory_block() and discards the
        accompanying block count.
        """
        available_kv_cache_memory_min, _ = \
            self.determine_available_memory_block()
        return available_kv_cache_memory_min

    def determine_available_memory_block(self) -> tuple[int, int]:
        """Return (KV cache bytes, num_gpu_blocks) available for the KV cache.

        If VLLM_VACC_KVCACHE_SPACE is 0, the size is derived from a profiling
        run; otherwise the env value (in GiB) is used directly. Swapping is
        not supported, so CPU blocks are not computed.
        """
        available_kv_cache_memory = int(
            os.getenv("VLLM_VACC_KVCACHE_SPACE", "16")) * GiB_bytes
        max_seq_num = int(os.getenv("MAX_SEQ_NUM", 4))
        if available_kv_cache_memory == 0:
            # Profile a forward pass to measure peak memory usage.
            torch.vacc.empty_cache()
            torch.vacc.reset_peak_memory_stats()
            total_memory = torch.vacc.mem_get_info()[1]
            self.model_runner.profile_run()
            torch.vacc.synchronize()
            peak_memory = torch.vacc.max_memory_allocated()
            torch.vacc.empty_cache()
            # Account for allocations made outside the torch caching
            # allocator (e.g. by vendor libraries).
            torch_allocated_bytes = torch.vacc.memory_stats(
            )["allocated_bytes.all.current"]
            total_allocated_bytes = torch.vacc.mem_get_info(
            )[1] - torch.vacc.mem_get_info()[0]
            non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
            if non_torch_allocations > 0:
                peak_memory += non_torch_allocations
            available_kv_cache_memory = (
                total_memory * self.cache_config.gpu_memory_utilization -
                peak_memory)

        if self.model_config.hf_config.model_type == "deepseek_v3":
            pp_size = self.vllm_config.parallel_config.pipeline_parallel_size
            assert self.model_config.max_model_len <= 65536 * pp_size, (
                "unsupported max_model_len: should be <= 65536 per pipeline "
                f"stage, but got {self.model_config.max_model_len}")

        # Rules:
        # 1. always reserve whole BLOCK_GROUP_SIZE-token groups of blocks
        # 2. no fewer than (MAX_SEQ_NUM + 1) groups
        minimum_num_gpu_blocks_required = (
            (max_seq_num + 1) * env_blk_grp_size // self.cache_config.block_size)
        # Round max_model_len up to a multiple of BLOCK_GROUP_SIZE.
        max_model_len = ((self.model_config.max_model_len + env_blk_grp_size - 1)
                         // env_blk_grp_size * env_blk_grp_size)
        max_num_gpu_blocks = max_model_len // self.cache_config.block_size

        # Limit the block count by available_kv_cache_memory.
        cache_block_size = self.get_cache_block_size_bytes()
        if cache_block_size == 0:
            num_gpu_blocks = 0
        else:
            num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
        assert num_gpu_blocks >= minimum_num_gpu_blocks_required, (
            f"num_gpu_blocks should be >= {minimum_num_gpu_blocks_required}; "
            "please increase VLLM_VACC_KVCACHE_SPACE")

        torch.vacc.empty_cache()
        # if self.model_runner.lora_manager:
        #     self.model_runner.remove_all_loras()
        gc.collect()
        if max_num_gpu_blocks != 0:
            num_gpu_blocks = min(max_num_gpu_blocks, num_gpu_blocks)
        num_gpu_blocks = max(num_gpu_blocks, minimum_num_gpu_blocks_required)
        available_kv_cache_memory_min = num_gpu_blocks * cache_block_size
        return available_kv_cache_memory_min, num_gpu_blocks
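
    # Worked example (all numbers are assumptions for illustration): with
    # MAX_SEQ_NUM=4, BLOCK_GROUP_SIZE=8192 and block_size=16, the floor is
    # (4 + 1) * 8192 // 16 = 2560 blocks. With VLLM_VACC_KVCACHE_SPACE=16
    # (so 16 GiB) and a 2 MiB cache block, memory allows 8192 blocks; a model
    # with max_model_len=10000 rounds up to 16384 tokens, capping blocks at
    # 16384 // 16 = 1024, so num_gpu_blocks = max(min(1024, 8192), 2560) = 2560.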

    def compile_or_warm_up_model(self) -> None:
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)
        # self.model_runner.warming_up_model()

    @torch.inference_mode()
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
    ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
        intermediate_tensors = None
        forward_pass = scheduler_output.total_num_scheduled_tokens > 0
        if forward_pass and not get_pp_group().is_first_rank:
            intermediate_tensors = IntermediateTensors(
                get_pp_group().recv_tensor_dict(
                    all_gather_group=get_tp_group()))

        output = self.model_runner.execute_model(scheduler_output,
                                                 intermediate_tensors)
        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
            return output

        if not get_pp_group().is_last_rank:
            assert isinstance(output, IntermediateTensors)
            get_pp_group().send_tensor_dict(output.tensors,
                                            all_gather_group=get_tp_group())
            return None

        assert isinstance(output, ModelRunnerOutput)
        return output if self.is_driver_worker else None
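
    # Pipeline-parallel dataflow of execute_model, for reference: a non-first
    # stage first receives IntermediateTensors from the previous stage;
    # non-last stages forward their IntermediateTensors downstream and return
    # None; only the last stage returns ModelRunnerOutput (driver worker only).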

    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
        """Return this rank's CPU id binding based on NUMA nodes."""
        rank_to_cpus = self.local_omp_cpuid
        # Set up OpenMP thread affinity based on NUMA nodes automatically.
        world_size = self.vllm_config.parallel_config.world_size
        libnuma_found = util.find_spec("numa") is not None
        psutil_found = util.find_spec("psutil") is not None
        if libnuma_found and psutil_found:
            import psutil
            from numa import info
            cpu_count = psutil.cpu_count(logical=False)
            cpus_allow_list = psutil.Process().cpu_affinity()
            numa_size = info.get_num_configured_nodes()
            cpu_count_per_numa = cpu_count // numa_size
            num_of_reserved_cpu = min(envs.VLLM_VACC_NUM_OF_RESERVED_VACC,
                                      cpu_count_per_numa // 2)

            # Collect the allowed CPUs of each NUMA node.
            node_to_cpus = []
            for i in range(numa_size):
                node_intersect = set(
                    info.node_to_cpus(i)).intersection(cpus_allow_list)
                if bool(node_intersect):
                    node_to_cpus.append(list(node_intersect))

            if world_size > len(node_to_cpus):
                logger.error(
                    "Auto thread-binding failed because "
                    "world size %d is larger than the number of "
                    "allowed NUMA nodes (%d). "
                    "Please try to bind threads manually.", world_size,
                    len(node_to_cpus))
            else:
                end = cpu_count_per_numa - num_of_reserved_cpu
                rank_to_cpus_list = node_to_cpus[self.rank][:end]
                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
                logger.info("auto thread-binding list: %s", rank_to_cpus)
        else:
            logger.warning(
                "Auto thread-binding is not supported because the numa and "
                "psutil packages are missing; falling back to no "
                "thread-binding. For better performance, please bind "
                "threads manually.")
        return rank_to_cpus

    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate the VACC KV cache with the specified kv_cache_config."""
        if self.vllm_config.model_config.enable_sleep_mode:
            allocator = CuMemAllocator.get_instance()
            context = allocator.use_memory_pool(tag="kv_cache")
        else:
            context = nullcontext()
        with context:
            self.model_runner.initialize_kv_cache(kv_cache_config)
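

# Example environment configuration (illustrative; the defaults shown are the
# ones read by determine_available_memory_block above):
#   VLLM_VACC_KVCACHE_SPACE=16   # reserve 16 GiB for the KV cache (default)
#   VLLM_VACC_KVCACHE_SPACE=0    # size the cache from a profiling run instead
#   MAX_SEQ_NUM=4                # floor for the minimum-block rule (default)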