"""A VACC worker class."""
import gc
import os
import uuid
from contextlib import nullcontext
from importlib import util
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple, Type, Union

import torch

from vllm import envs
from vllm.config import VllmConfig
from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from vllm.logger import init_logger
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import IntermediateTensors
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment,
                              set_custom_all_reduce)
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes, LayerBlockType,
                        get_dtype_size)
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput, AsyncModelRunnerOutput
from vllm_vacc.vllm.v1.worker.vacc_model_runner import VACCModelRunner
from vllm.v1.worker.gpu_worker import Worker
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm_vacc.vllm.model_executor.models.vars import (
    BLOCK_GROUP_SIZE as env_blk_grp_size)

logger = init_logger(__name__)

# Group id handed to the TP group; replaced by generate_tp_group_id() with a
# value derived from the ``.bootinfos`` file in the working directory.
TP_GROUP_ID = 1234


def generate_rank_info_list():
    """Ask the TP group to build its rank/device info and register the id.

    Reads the module-level ``TP_GROUP_ID``, so ``generate_tp_group_id()``
    should run first if a boot-specific id is wanted.
    """
    tp_group = get_tp_group()
    tp_group.generate_rank_device_infos()
    tp_group.generate_group_id(TP_GROUP_ID)


def generate_tp_group_id():
    """Derive ``TP_GROUP_ID`` from the first line of ``<cwd>/.bootinfos``.

    The first line of the file (or the literal ``"default"`` when the file is
    missing or unreadable) is hashed with ``uuid5`` and the low 32 bits are
    reinterpreted as a signed 32-bit integer, which is stored in the
    module-level ``TP_GROUP_ID``.
    """
    global TP_GROUP_ID

    bootinfo_path = Path.cwd() / ".bootinfos"
    current_bootinfos = "default"
    if bootinfo_path.exists():
        try:
            with open(bootinfo_path) as bootinfo_file:
                # The line is used verbatim (including any trailing newline)
                # so the derived id matches what earlier versions produced.
                current_bootinfos = bootinfo_file.readline()
        except (OSError, UnicodeError) as e:
            # Best effort: fall back to the "default" seed on read failure.
            logger.warning("bootinfo load fail: %s", e)

    unique_value = uuid.uuid5(uuid.NAMESPACE_URL, current_bootinfos).int
    int32_value = unique_value & 0xFFFFFFFF
    # Fold the unsigned 32-bit value into the signed int32 range.
    if int32_value >= 2**31:
        int32_value -= 2**32
    TP_GROUP_ID = int32_value


def init_worker_distributed_environment(
    vllm_config: VllmConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    local_rank: int = -1,
    backend: str = "vccl",
) -> None:
    """Initialize the distributed environment."""
    parallel_config = vllm_config.parallel_config
    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)

    init_distributed_environment(parallel_config.world_size, rank,
                                 distributed_init_method, local_rank, backend)

    ensure_model_parallel_initialized(
        parallel_config.tensor_parallel_size,
        parallel_config.pipeline_parallel_size,
        parallel_config.decode_context_parallel_size)

    ensure_kv_transfer_initialized(vllm_config)

    # The group id must be computed before it is registered with the TP group.
    generate_tp_group_id()
    generate_rank_info_list()


def get_cache_block_size(
    cache_config: CacheConfig,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
) -> int:
    """Return the size in bytes of one KV cache block across all layers.

    The result is ``dtype_size * num_attention_layers * block_size *
    (key_entry + value_entry)`` where each entry is ``num_kv_heads *
    head_size`` elements.
    """
    head_size = model_config.get_head_size()
    num_heads = model_config.get_num_kv_heads(parallel_config)
    num_attention_layers = model_config.get_num_layers_by_block_type(
        parallel_config, LayerBlockType.attention)

    if cache_config.cache_dtype == "auto":
        dtype = model_config.dtype
    else:
        dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]

    key_cache_entry = num_heads * head_size
    # For MLA there is no value cache, since the latent vector
    # is joint keys and values.
    value_cache_entry = key_cache_entry if not model_config.use_mla else 0
    total = num_attention_layers * cache_config.block_size * \
        (key_cache_entry + value_cache_entry)

    return get_dtype_size(dtype) * total


class VACCWorker(Worker):
    """vLLM V1 worker that drives a single VACC device."""

    def __init__(self,
                 vllm_config: VllmConfig,
                 local_rank: int,
                 rank: int,
                 distributed_init_method: str,
                 is_driver_worker: bool = False):
        super().__init__(vllm_config,
                         local_rank,
                         rank,
                         distributed_init_method,
                         is_driver_worker=is_driver_worker)
        # Custom all-reduce is always disabled for this worker.
        self.parallel_config.disable_custom_all_reduce = True

    def init_device(self) -> None:
        """Bind to the local VACC device and build the model runner.

        Raises:
            RuntimeError: if the configured device type is not ``"vacc"`` or
                the device cannot be selected.
        """
        if self.device_config.device.type != "vacc":
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")

        device_name = f"vacc:{self.local_rank}"
        try:
            self.device = torch.device(device_name)
            torch.vacc.set_device(self.device)
            gc.collect()
            torch.vacc.empty_cache()
        except Exception as e:
            # Build one message string (not a tuple of args) and avoid
            # touching self.device, which may not have been assigned yet.
            raise RuntimeError(
                f"device init fail: {e} "
                f"self.device: {device_name}, "
                "check /dev/* or VACC_VISIBLE_DEVICES") from e

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config, self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)

        # Construct the model runner
        self.model_runner: VACCModelRunner = VACCModelRunner(
            self.vllm_config, self.device)

    def sleep(self, level: int = 1) -> None:
        """No-op: sleep mode is not available on VACC; only logs a warning."""
        logger.warning("sleep mode is not supported on VACC, ignore it.")

    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        """No-op: sleep mode is not available on VACC; only logs a warning."""
        logger.warning("sleep mode is not supported on VACC, ignore it.")

    def get_cache_block_size_bytes(self) -> int:
        """Get the size of the KV cache block size in bytes."""
        return get_cache_block_size(self.cache_config, self.model_config,
                                    self.parallel_config)

    def determine_available_memory(self) -> int:
        """Return the number of bytes to dedicate to the KV cache.

        This is the byte size of the block count chosen by
        ``determine_available_memory_block``.
        """
        available_kv_cache_memory_min, _ = \
            self.determine_available_memory_block()
        return available_kv_cache_memory_min

    def _profile_available_kv_cache_memory(self) -> float:
        """Run a profiling pass and return the memory left for the KV cache.

        Computed as ``total * gpu_memory_utilization - peak`` where ``peak``
        is the torch peak allocation during ``profile_run`` plus any memory
        in use on the device that torch does not account for.
        """
        torch.vacc.empty_cache()
        torch.vacc.reset_peak_memory_stats()
        total_memory = torch.vacc.mem_get_info()[1]

        self.model_runner.profile_run()
        torch.vacc.synchronize()

        peak_memory = torch.vacc.max_memory_allocated()
        torch.vacc.empty_cache()

        torch_allocated_bytes = torch.vacc.memory_stats(
        )["allocated_bytes.all.current"]
        free_bytes, total_bytes = torch.vacc.mem_get_info()[:2]
        non_torch_allocations = (total_bytes -
                                 free_bytes) - torch_allocated_bytes
        if non_torch_allocations > 0:
            peak_memory += non_torch_allocations

        return (total_memory * self.cache_config.gpu_memory_utilization -
                peak_memory)

    def determine_available_memory_block(self) -> Tuple[int, int]:
        """Choose the KV cache size for this worker.

        The budget comes from ``VLLM_VACC_KVCACHE_SPACE`` (GiB, default 16);
        a value of 0 triggers a profiling run instead. ``MAX_SEQ_NUM``
        (default 4) sets the minimum number of blocks that must fit.

        Returns:
            ``(kv_cache_bytes, num_gpu_blocks)`` where ``kv_cache_bytes`` is
            ``num_gpu_blocks * cache_block_size``.

        Raises:
            AssertionError: if a ``deepseek_v3`` model exceeds the supported
                context length, or the budget cannot hold the minimum number
                of blocks.
        """
        available_kv_cache_memory = int(
            os.getenv("VLLM_VACC_KVCACHE_SPACE", "16")) * GiB_bytes
        max_seq_num = int(os.getenv("MAX_SEQ_NUM", "4"))

        if available_kv_cache_memory == 0:
            available_kv_cache_memory = \
                self._profile_available_kv_cache_memory()

        if self.model_config.hf_config.model_type == "deepseek_v3":
            assert self.model_config.max_model_len <= 65536*self.vllm_config.parallel_config.pipeline_parallel_size, f"unsupported max model len, should less equal 65536 but got {self.model_config.max_model_len}"

        block_size = self.cache_config.block_size
        # Rules:
        # 1. always reserve N * 8K blocks
        # 2. no less than (MAX_SEQ_NUM + 1) * 8K blocks
        minimum_num_gpu_blocks_required = \
            (max_seq_num + 1) * env_blk_grp_size // block_size
        # Round max_model_len up to a whole number of block groups.
        max_model_len = (self.model_config.max_model_len + env_blk_grp_size -
                         1) // env_blk_grp_size * env_blk_grp_size
        max_num_gpu_blocks = max_model_len // block_size

        # limited by available_kv_cache_memory
        cache_block_size = self.get_cache_block_size_bytes()
        if cache_block_size == 0:
            num_gpu_blocks = 0
        else:
            num_gpu_blocks = int(available_kv_cache_memory //
                                 cache_block_size)

        assert num_gpu_blocks >= minimum_num_gpu_blocks_required, \
            f"num_gpu_blocks should >= {minimum_num_gpu_blocks_required} please increase VLLM_VACC_KVCACHE_SPACE"

        torch.vacc.empty_cache()
        gc.collect()

        # Never allocate more than one max-length sequence worth of blocks,
        # but never drop below the required minimum either.
        if max_num_gpu_blocks != 0:
            num_gpu_blocks = min(max_num_gpu_blocks, num_gpu_blocks)
        num_gpu_blocks = max(num_gpu_blocks, minimum_num_gpu_blocks_required)

        available_kv_cache_memory_min = num_gpu_blocks * cache_block_size
        return available_kv_cache_memory_min, num_gpu_blocks

    def compile_or_warm_up_model(self) -> None:
        """Reseed the RNG; no warm-up is performed here."""
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)

    @torch.inference_mode()
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
    ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
        """Run one scheduler step on this worker.

        Non-first pipeline ranks receive intermediate tensors before running;
        non-last ranks forward theirs and return ``None``. On the last rank
        the output is returned only by the driver worker, unless the runner
        already produced a ``ModelRunnerOutput``/``AsyncModelRunnerOutput``,
        which is returned as is.
        """
        intermediate_tensors = None
        forward_pass = scheduler_output.total_num_scheduled_tokens > 0
        if forward_pass and not get_pp_group().is_first_rank:
            intermediate_tensors = IntermediateTensors(
                get_pp_group().recv_tensor_dict(
                    all_gather_group=get_tp_group()))

        output = self.model_runner.execute_model(scheduler_output,
                                                 intermediate_tensors)
        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
            return output

        if not get_pp_group().is_last_rank:
            assert isinstance(output, IntermediateTensors)
            get_pp_group().send_tensor_dict(output.tensors,
                                            all_gather_group=get_tp_group())
            return None

        assert isinstance(output, ModelRunnerOutput)
        return output if self.is_driver_worker else None

    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
        """Return the CPU id list this rank should bind to, by NUMA node.

        Requires the ``numa`` and ``psutil`` packages; when they are missing,
        or there are fewer usable NUMA nodes than ranks, the existing
        ``self.local_omp_cpuid`` value is returned unchanged.

        NOTE(review): ``self.local_omp_cpuid`` and
        ``envs.VLLM_VACC_NUM_OF_RESERVED_VACC`` are not defined in this file;
        confirm they are provided by the base class / env module.
        """
        rank_to_cpus = self.local_omp_cpuid
        # Setup OpenMP thread affinity based on NUMA nodes automatically
        world_size = self.vllm_config.parallel_config.world_size
        libnuma_found = util.find_spec("numa") is not None
        psutil_found = util.find_spec("psutil") is not None
        if libnuma_found and psutil_found:
            import psutil
            from numa import info
            cpu_count = psutil.cpu_count(logical=False)
            cpus_allow_list = psutil.Process().cpu_affinity()
            numa_size = info.get_num_configured_nodes()
            cpu_count_per_numa = cpu_count // numa_size
            num_of_reserved_cpu = min(envs.VLLM_VACC_NUM_OF_RESERVED_VACC,
                                      cpu_count_per_numa // 2)

            # check allow node_to_cpus list
            node_to_cpus = []
            for i in range(numa_size):
                node_intersect = set(
                    info.node_to_cpus(i)).intersection(cpus_allow_list)
                if node_intersect:
                    node_to_cpus.append(list(node_intersect))

            if world_size > len(node_to_cpus):
                logger.error(
                    "Auto thread-binding failed due to "
                    "world size: %d is larger than "
                    "allowed NUMA nodes number: %d."
                    "Please try to bind threads manually.", world_size,
                    len(node_to_cpus))
            else:
                end = cpu_count_per_numa - num_of_reserved_cpu
                rank_to_cpus_list = node_to_cpus[self.rank][:end]
                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
                logger.info("auto thread-binding list: %s", rank_to_cpus)
        else:
            logger.warning(
                "Auto thread-binding is not supported due to "
                "the lack of package numa and psutil,"
                "fallback to no thread-binding. To get better performance,"
                "please try to manually bind threads.")
        return rank_to_cpus

    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate GPU KV cache with the specified kv_cache_config."""
        if self.vllm_config.model_config.enable_sleep_mode:
            # Imported lazily: only needed (and only guaranteed importable)
            # when sleep mode is requested.
            from vllm.device_allocator.cumem import CuMemAllocator
            allocator = CuMemAllocator.get_instance()
            context = allocator.use_memory_pool(tag="kv_cache")
        else:
            context = nullcontext()
        with context:
            self.model_runner.initialize_kv_cache(kv_cache_config)