72 lines
1.9 KiB
Python
72 lines
1.9 KiB
Python
|
|
|
|||
|
|
|
|||
|
|
from enum import Enum
|
|||
|
|
import os
|
|||
|
|
import torch
|
|||
|
|
import vllm.envs as envs
|
|||
|
|
|
|||
|
|
# Read once at import time: "no thread" zero-overhead mode is opted into by
# setting the VLLM_ZERO_NO_THREAD environment variable to exactly '1'.
zero_no_thread = os.environ.get('VLLM_ZERO_NO_THREAD') == '1'
|
|||
|
|
|
|||
|
|
def is_zero_no_thread():
    """Return whether "no thread" mode is requested and zero-overhead
    scheduling is enabled (``envs.VLLM_ZERO_OVERHEAD``)."""
    if not zero_no_thread:
        return False
    return envs.VLLM_ZERO_OVERHEAD
|
|||
|
|
|
|||
|
|
class SpecStepKind(Enum):
    """Kind of the current speculative-decoding step.

    NOTE(review): member semantics below are inferred from the names —
    confirm against the call sites that pass these to set_spec_step().
    """
    KIND_DEFAULT = 0  # initial / ordinary (non-speculative) step
    PREFILL = 1  # presumably a prompt prefill step
    FIRST_PROPOSAL = 2  # presumably the first draft-proposal step
    OTHER_PROPOSAL = 3  # presumably subsequent draft-proposal steps
    SCORE_DECODE = 4  # presumably the target-model scoring/decode step
|
|||
|
|
|
|||
|
|
class ZeroOverheadSpecContext:
    """Mutable holder for speculative-decoding state.

    A single module-level instance is shared through the record_*/get_*
    accessor functions in this module.
    """

    def __init__(self):
        # Current and previous step kinds both start at the default kind.
        self.step_kind = self.last_step = SpecStepKind.KIND_DEFAULT
        # Proposal/acceptance payloads are unset until recorded.
        self.proposal_lens_list = None
        self.proposal_token_ids = None
        self.accepted_token_ids, self.accepted_seq_ids = None, None
|
|||
|
|
|
|||
|
|
# Module-level singleton shared by the record_*/get_* accessor functions.
spec_context = ZeroOverheadSpecContext()
|
|||
|
|
|
|||
|
|
def set_spec_step(_step):
    """Advance the shared context to *_step*, remembering the previous kind."""
    global spec_context
    previous = spec_context.step_kind
    spec_context.last_step = previous
    spec_context.step_kind = _step
|
|||
|
|
|
|||
|
|
def get_spec_step():
    """Return the step kind most recently set via set_spec_step()."""
    ctx = spec_context
    return ctx.step_kind
|
|||
|
|
|
|||
|
|
def get_spec_last_step():
    """Return the step kind that was current before the latest set_spec_step()."""
    ctx = spec_context
    return ctx.last_step
|
|||
|
|
|
|||
|
|
def record_proposal_lens_list(lens_list):
    """Stash the proposal lengths on the shared spec context.

    Args:
        lens_list: proposal lengths to record (presumably one entry per
            sequence — TODO confirm against the caller). Renamed from
            ``list``, which shadowed the builtin; callers pass it
            positionally, so the rename is backward-compatible.
    """
    # Attribute mutation does not rebind the module-level name, so no
    # ``global`` statement is required here.
    spec_context.proposal_lens_list = lens_list
|
|||
|
|
|
|||
|
|
def get_proposal_lens_list():
    """Return the value recorded by record_proposal_lens_list() (or None)."""
    ctx = spec_context
    return ctx.proposal_lens_list
|
|||
|
|
|
|||
|
|
def record_proposal_token_ids(tensor):
    """Stash the proposal token ids on the shared spec context."""
    # Attribute mutation only — the global binding itself is never rebound,
    # so no ``global`` statement is needed.
    spec_context.proposal_token_ids = tensor
|
|||
|
|
|
|||
|
|
def get_proposal_token_ids():
    """Return the value recorded by record_proposal_token_ids() (or None)."""
    ctx = spec_context
    return ctx.proposal_token_ids
|
|||
|
|
|
|||
|
|
def record_accepted_token_ids(tensor, seq_ids):
    """Stash the accepted token ids together with their sequence ids."""
    ctx = spec_context
    ctx.accepted_token_ids = tensor
    ctx.accepted_seq_ids = seq_ids
|
|||
|
|
|
|||
|
|
def get_accepted_token_ids():
    """Return the ``(accepted_token_ids, accepted_seq_ids)`` pair as last
    recorded by record_accepted_token_ids() (both None if never set)."""
    ctx = spec_context
    return ctx.accepted_token_ids, ctx.accepted_seq_ids
|
|||
|
|
|
|||
|
|
# Zero-overhead scheduling does not run inference on the default stream, in
# order to avoid the memory-allocation stream-synchronization problems
# introduced by the runtime. (Translated from the original Chinese comment.)
# Maps a target device to its dedicated torch.cuda.Stream; populated lazily
# by zero_overhead_stream().
alloc_stream = {}
|
|||
|
|
|
|||
|
|
def zero_overhead_stream(target_device):
    """Return the dedicated CUDA stream for *target_device*, creating and
    caching it on first use.

    Zero-overhead scheduling runs inference off the default stream to avoid
    the stream-synchronization problems the runtime's memory allocation
    introduces; one ``torch.cuda.Stream`` per device is cached in the
    module-level ``alloc_stream`` dict.

    The previous docstring ("Asynchronously create a tensor and copy it from
    host to device") did not describe this function — no tensor is created
    and nothing is copied here.

    Args:
        target_device: device identifier passed to ``torch.cuda.Stream`` and
            used as the cache key (presumably an index or ``torch.device``
            — TODO confirm against callers).

    Returns:
        The cached (or newly created) ``torch.cuda.Stream`` for the device.
    """
    # Single lookup instead of the `not in alloc_stream.keys()` anti-idiom.
    stream = alloc_stream.get(target_device)
    if stream is None:
        stream = torch.cuda.Stream(device=target_device)
        alloc_stream[target_device] = stream
    return stream
|