# enginex-vastai-va16-vllm/vllm_vacc/vllm/v1/worker/vacc_worker.py
"""A VACC worker class."""
import gc
import os
from contextlib import nullcontext
from importlib import util
from typing import Optional, Union

import torch

from vllm import envs
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         VllmConfig)
# Sleep-mode allocator; import path follows upstream vLLM and is assumed to
# be present in this tree (it is referenced by initialize_from_config below).
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment,
                              set_custom_all_reduce)
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from vllm.logger import init_logger
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import IntermediateTensors
from vllm.utils import (GiB_bytes, STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType,
                        get_dtype_size)
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import AsyncModelRunnerOutput, ModelRunnerOutput
from vllm.v1.worker.gpu_worker import Worker

from vllm_vacc.vllm.model_executor.models.vars import \
    BLOCK_GROUP_SIZE as env_blk_grp_size
from vllm_vacc.vllm.v1.worker.vacc_model_runner import VACCModelRunner

logger = init_logger(__name__)

TP_GROUP_ID = 1234


def generate_rank_info_list():
    # Collect per-rank device info for the TP group and register the
    # deterministic group id produced by generate_tp_group_id().
    get_tp_group().generate_rank_device_infos()
    get_tp_group().generate_group_id(TP_GROUP_ID)


def generate_tp_group_id():
    """Derive a deterministic signed 32-bit TP group id from .bootinfos."""
    global TP_GROUP_ID
    import uuid
    from pathlib import Path
    bootinfo_config = f'{Path.cwd()}/.bootinfos'
    current_bootinfos = "default"
    if os.path.exists(bootinfo_config):
        try:
            with open(bootinfo_config) as f:
                current_bootinfos = f.readline()
        except Exception as e:
            logger.warning("bootinfo load failed: %s", e)
    # Hash the bootinfo string into a 128-bit UUID, then fold it into a
    # signed int32 so every rank derives the same group id.
    unique_value = uuid.uuid5(uuid.NAMESPACE_URL, current_bootinfos).int
    int32_value = unique_value & 0xFFFFFFFF
    if int32_value >= 2**31:
        int32_value -= 2**32
    TP_GROUP_ID = int32_value


def init_worker_distributed_environment(
    vllm_config: VllmConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    local_rank: int = -1,
    backend: str = "vccl",
) -> None:
    """Initialize the distributed environment."""
    parallel_config = vllm_config.parallel_config
    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
    init_distributed_environment(parallel_config.world_size, rank,
                                 distributed_init_method, local_rank,
                                 backend)
    ensure_model_parallel_initialized(
        parallel_config.tensor_parallel_size,
        parallel_config.pipeline_parallel_size,
        parallel_config.decode_context_parallel_size)
    ensure_kv_transfer_initialized(vllm_config)
    generate_tp_group_id()
    generate_rank_info_list()


def get_cache_block_size(
    cache_config: CacheConfig,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
) -> int:
    head_size = model_config.get_head_size()
    num_heads = model_config.get_num_kv_heads(parallel_config)
    num_attention_layers = model_config.get_num_layers_by_block_type(
        parallel_config, LayerBlockType.attention)

    if cache_config.cache_dtype == "auto":
        dtype = model_config.dtype
    else:
        dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]

    key_cache_entry = num_heads * head_size
    # For MLA there is no value cache, since the latent vector
    # holds the joint keys and values.
    value_cache_entry = key_cache_entry if not model_config.use_mla else 0
    total = num_attention_layers * cache_config.block_size * \
        (key_cache_entry + value_cache_entry)
    dtype_size = get_dtype_size(dtype)
    return dtype_size * total
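

# Worked example (illustrative numbers, not from any real config): for a
# Llama-style model with 32 attention layers, 8 KV heads of head_size 128,
# block_size 16 and an fp16 cache dtype, one cache block costs
#   32 layers * 16 tokens * (8*128 key + 8*128 value) elems * 2 bytes = 2 MiB.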


class VACCWorker(Worker):

    def __init__(self,
                 vllm_config: VllmConfig,
                 local_rank: int,
                 rank: int,
                 distributed_init_method: str,
                 is_driver_worker: bool = False):
        super().__init__(vllm_config,
                         local_rank,
                         rank,
                         distributed_init_method,
                         is_driver_worker=is_driver_worker)
        # Custom all-reduce is a CUDA-specific fast path; force it off on VACC.
        self.parallel_config.disable_custom_all_reduce = True

    def init_device(self) -> None:
        if self.device_config.device.type == "vacc":
            try:
                self.device = torch.device(f"vacc:{self.local_rank}")
                torch.vacc.set_device(self.device)
                gc.collect()
                torch.vacc.empty_cache()
            except Exception as e:
                raise RuntimeError(
                    f"device init failed: {e}, self.device: {self.device}, "
                    "check /dev/* or VACC_VISIBLE_DEVICES")
        else:
            raise RuntimeError(
                f"Unsupported device type: {self.device_config.device}")

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config, self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)
        # Construct the model runner.
        self.model_runner: VACCModelRunner = VACCModelRunner(
            self.vllm_config, self.device)

    def sleep(self, level: int = 1) -> None:
        logger.warning("Sleep mode is not supported on VACC; ignoring.")

    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        logger.warning("Sleep mode is not supported on VACC; ignoring.")

    def get_cache_block_size_bytes(self) -> int:
        """Return the size of a single KV cache block in bytes."""
        return get_cache_block_size(self.cache_config, self.model_config,
                                    self.parallel_config)

    def determine_available_memory(self) -> int:
        """Return the number of bytes available for the KV cache.

        Delegates to determine_available_memory_block() and discards the
        accompanying block count.
        """
        available_kv_cache_memory_min, _ = \
            self.determine_available_memory_block()
        return available_kv_cache_memory_min

    def determine_available_memory_block(self) -> tuple[int, int]:
        """Return (KV cache bytes, num_gpu_blocks) available for the KV cache.

        If VLLM_VACC_KVCACHE_SPACE is 0, the size is derived from a profiling
        run; otherwise the env value (in GiB) is used directly. Swapping is
        not supported, so CPU blocks are not computed.
        """
        available_kv_cache_memory = int(
            os.getenv("VLLM_VACC_KVCACHE_SPACE", "16")) * GiB_bytes
        max_seq_num = int(os.getenv("MAX_SEQ_NUM", 4))
        if available_kv_cache_memory == 0:
            # Profile a forward pass to measure peak memory usage.
            torch.vacc.empty_cache()
            torch.vacc.reset_peak_memory_stats()
            total_memory = torch.vacc.mem_get_info()[1]
            self.model_runner.profile_run()
            torch.vacc.synchronize()
            peak_memory = torch.vacc.max_memory_allocated()
            torch.vacc.empty_cache()
            # Account for allocations made outside the torch caching
            # allocator (e.g. by vendor libraries).
            torch_allocated_bytes = torch.vacc.memory_stats(
            )["allocated_bytes.all.current"]
            total_allocated_bytes = torch.vacc.mem_get_info(
            )[1] - torch.vacc.mem_get_info()[0]
            non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
            if non_torch_allocations > 0:
                peak_memory += non_torch_allocations
            available_kv_cache_memory = (
                total_memory * self.cache_config.gpu_memory_utilization -
                peak_memory)

        if self.model_config.hf_config.model_type == "deepseek_v3":
            pp_size = self.vllm_config.parallel_config.pipeline_parallel_size
            assert self.model_config.max_model_len <= 65536 * pp_size, (
                "unsupported max_model_len: should be <= 65536 per pipeline "
                f"stage, but got {self.model_config.max_model_len}")

        # Rules:
        # 1. always reserve whole BLOCK_GROUP_SIZE-token groups of blocks
        # 2. no fewer than (MAX_SEQ_NUM + 1) groups
        minimum_num_gpu_blocks_required = (
            (max_seq_num + 1) * env_blk_grp_size // self.cache_config.block_size)
        # Round max_model_len up to a multiple of BLOCK_GROUP_SIZE.
        max_model_len = ((self.model_config.max_model_len + env_blk_grp_size - 1)
                         // env_blk_grp_size * env_blk_grp_size)
        max_num_gpu_blocks = max_model_len // self.cache_config.block_size

        # Limit the block count by available_kv_cache_memory.
        cache_block_size = self.get_cache_block_size_bytes()
        if cache_block_size == 0:
            num_gpu_blocks = 0
        else:
            num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
        assert num_gpu_blocks >= minimum_num_gpu_blocks_required, (
            f"num_gpu_blocks should be >= {minimum_num_gpu_blocks_required}; "
            "please increase VLLM_VACC_KVCACHE_SPACE")

        torch.vacc.empty_cache()
        # if self.model_runner.lora_manager:
        #     self.model_runner.remove_all_loras()
        gc.collect()
        if max_num_gpu_blocks != 0:
            num_gpu_blocks = min(max_num_gpu_blocks, num_gpu_blocks)
        num_gpu_blocks = max(num_gpu_blocks, minimum_num_gpu_blocks_required)
        available_kv_cache_memory_min = num_gpu_blocks * cache_block_size
        return available_kv_cache_memory_min, num_gpu_blocks
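
    # Worked example (all numbers are assumptions for illustration): with
    # MAX_SEQ_NUM=4, BLOCK_GROUP_SIZE=8192 and block_size=16, the floor is
    # (4 + 1) * 8192 // 16 = 2560 blocks. With VLLM_VACC_KVCACHE_SPACE=16
    # (so 16 GiB) and a 2 MiB cache block, memory allows 8192 blocks; a model
    # with max_model_len=10000 rounds up to 16384 tokens, capping blocks at
    # 16384 // 16 = 1024, so num_gpu_blocks = max(min(1024, 8192), 2560) = 2560.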

    def compile_or_warm_up_model(self) -> None:
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)
        # self.model_runner.warming_up_model()

    @torch.inference_mode()
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
    ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
        intermediate_tensors = None
        forward_pass = scheduler_output.total_num_scheduled_tokens > 0
        if forward_pass and not get_pp_group().is_first_rank:
            intermediate_tensors = IntermediateTensors(
                get_pp_group().recv_tensor_dict(
                    all_gather_group=get_tp_group()))

        output = self.model_runner.execute_model(scheduler_output,
                                                 intermediate_tensors)
        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
            return output

        if not get_pp_group().is_last_rank:
            assert isinstance(output, IntermediateTensors)
            get_pp_group().send_tensor_dict(output.tensors,
                                            all_gather_group=get_tp_group())
            return None

        assert isinstance(output, ModelRunnerOutput)
        return output if self.is_driver_worker else None
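
    # Pipeline-parallel dataflow of execute_model, for reference: a non-first
    # stage first receives IntermediateTensors from the previous stage;
    # non-last stages forward their IntermediateTensors downstream and return
    # None; only the last stage returns ModelRunnerOutput (driver worker only).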

    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
        """Return this rank's CPU id binding based on NUMA nodes."""
        rank_to_cpus = self.local_omp_cpuid
        # Set up OpenMP thread affinity based on NUMA nodes automatically.
        world_size = self.vllm_config.parallel_config.world_size
        libnuma_found = util.find_spec("numa") is not None
        psutil_found = util.find_spec("psutil") is not None
        if libnuma_found and psutil_found:
            import psutil
            from numa import info
            cpu_count = psutil.cpu_count(logical=False)
            cpus_allow_list = psutil.Process().cpu_affinity()
            numa_size = info.get_num_configured_nodes()
            cpu_count_per_numa = cpu_count // numa_size
            num_of_reserved_cpu = min(envs.VLLM_VACC_NUM_OF_RESERVED_VACC,
                                      cpu_count_per_numa // 2)

            # Collect the allowed CPUs of each NUMA node.
            node_to_cpus = []
            for i in range(numa_size):
                node_intersect = set(
                    info.node_to_cpus(i)).intersection(cpus_allow_list)
                if bool(node_intersect):
                    node_to_cpus.append(list(node_intersect))

            if world_size > len(node_to_cpus):
                logger.error(
                    "Auto thread-binding failed because "
                    "world size %d is larger than the number of "
                    "allowed NUMA nodes (%d). "
                    "Please try to bind threads manually.", world_size,
                    len(node_to_cpus))
            else:
                end = cpu_count_per_numa - num_of_reserved_cpu
                rank_to_cpus_list = node_to_cpus[self.rank][:end]
                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
                logger.info("auto thread-binding list: %s", rank_to_cpus)
        else:
            logger.warning(
                "Auto thread-binding is not supported because the numa and "
                "psutil packages are missing; falling back to no "
                "thread-binding. For better performance, please bind "
                "threads manually.")
        return rank_to_cpus

    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate the VACC KV cache with the specified kv_cache_config."""
        if self.vllm_config.model_config.enable_sleep_mode:
            allocator = CuMemAllocator.get_instance()
            context = allocator.use_memory_pool(tag="kv_cache")
        else:
            context = nullcontext()
        with context:
            self.model_runner.initialize_kv_cache(kv_cache_config)
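

# Example environment configuration (illustrative; the defaults shown are the
# ones read by determine_available_memory_block above):
#   VLLM_VACC_KVCACHE_SPACE=16   # reserve 16 GiB for the KV cache (default)
#   VLLM_VACC_KVCACHE_SPACE=0    # size the cache from a profiling run instead
#   MAX_SEQ_NUM=4                # floor for the minimum-block rule (default)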