# Zero-overhead speculative decoding: shared step context and helper accessors.
from enum import Enum
import os

import torch

import vllm.envs as envs

# Read once at import time: opt-in switch for running zero-overhead
# scheduling without the dedicated thread.
zero_no_thread = os.getenv('VLLM_ZERO_NO_THREAD') == '1'


def is_zero_no_thread():
    """Return True when no-thread zero-overhead scheduling is enabled.

    Requires both the ``VLLM_ZERO_NO_THREAD=1`` environment opt-in and the
    global ``VLLM_ZERO_OVERHEAD`` feature flag.
    """
    return zero_no_thread and envs.VLLM_ZERO_OVERHEAD
class SpecStepKind(Enum):
    """Kind of a step in the zero-overhead speculative-decoding pipeline.

    NOTE(review): member semantics inferred from names — PREFILL for prompt
    prefill, FIRST/OTHER_PROPOSAL for draft proposal steps, SCORE_DECODE for
    the scoring decode; confirm against the call sites that set them.
    """
    # Neutral value before any step kind has been recorded.
    KIND_DEFAULT = 0
    PREFILL = 1
    FIRST_PROPOSAL = 2
    OTHER_PROPOSAL = 3
    SCORE_DECODE = 4
class ZeroOverheadSpecContext():
    """Mutable holder for per-step speculative-decoding state.

    One module-level instance (``spec_context``) is shared by the
    record/get helper functions below.
    """

    def __init__(self):
        # Current and previous step kinds of the speculative pipeline.
        self.step_kind = SpecStepKind.KIND_DEFAULT
        self.last_step = SpecStepKind.KIND_DEFAULT
        # Proposal/acceptance artifacts; None until recorded.
        for attr in ('proposal_lens_list', 'proposal_token_ids',
                     'accepted_token_ids', 'accepted_seq_ids'):
            setattr(self, attr, None)


# Shared singleton used by the accessor functions in this module.
spec_context = ZeroOverheadSpecContext()
def set_spec_step(_step):
    """Advance the recorded step kind, keeping the previous one in ``last_step``."""
    global spec_context
    previous = spec_context.step_kind
    spec_context.last_step = previous
    spec_context.step_kind = _step
def get_spec_step():
    """Return the step kind of the current speculative step."""
    current = spec_context.step_kind
    return current
def get_spec_last_step():
    """Return the step kind recorded before the current one."""
    previous = spec_context.last_step
    return previous
def record_proposal_lens_list(lens_list):
    """Store the proposal lengths of the current step in the shared context.

    Note: the parameter was renamed from ``list``, which shadowed the
    builtin; positional call sites are unaffected.
    """
    global spec_context
    spec_context.proposal_lens_list = lens_list
def get_proposal_lens_list():
    """Return the proposal lengths recorded for the current step."""
    recorded = spec_context.proposal_lens_list
    return recorded
def record_proposal_token_ids(tensor):
    """Store the proposal token-id tensor in the shared context."""
    global spec_context
    spec_context.proposal_token_ids = tensor
def get_proposal_token_ids():
    """Return the proposal token-id tensor recorded for the current step."""
    recorded = spec_context.proposal_token_ids
    return recorded
def record_accepted_token_ids(tensor, seq_ids):
    """Store the accepted token ids together with their sequence ids."""
    global spec_context
    ctx = spec_context
    ctx.accepted_token_ids = tensor
    ctx.accepted_seq_ids = seq_ids
def get_accepted_token_ids():
    """Return ``(accepted_token_ids, accepted_seq_ids)`` from the shared context."""
    ctx = spec_context
    return ctx.accepted_token_ids, ctx.accepted_seq_ids
# Zero-overhead scheduling does not run inference on the default stream, in
# order to avoid the stream synchronization that runtime memory allocation
# would otherwise introduce.
alloc_stream = {}


def zero_overhead_stream(target_device):
    """Return the per-device side stream used for async host-to-device copies.

    Lazily creates one ``torch.cuda.Stream`` per ``target_device`` and caches
    it in ``alloc_stream`` so repeated calls reuse the same stream. The lazy
    ``in`` check (rather than ``setdefault``) avoids constructing a stream
    when one is already cached; the redundant ``.keys()`` call was removed.
    """
    if target_device not in alloc_stream:
        alloc_stream[target_device] = torch.cuda.Stream(device=target_device)
    return alloc_stream[target_device]