init src 0.9.2
This commit is contained in:
71
vllm/zero_overhead/utils.py
Normal file
71
vllm/zero_overhead/utils.py
Normal file
@@ -0,0 +1,71 @@
|
||||
|
||||
|
||||
from enum import Enum
|
||||
import os
|
||||
import torch
|
||||
import vllm.envs as envs
|
||||
|
||||
# One-time snapshot taken at import: set VLLM_ZERO_NO_THREAD=1 in the
# environment to enable the "no thread" zero-overhead mode (see
# is_zero_no_thread below). Changing the env var later has no effect.
zero_no_thread = os.environ.get('VLLM_ZERO_NO_THREAD') == '1'
|
||||
|
||||
def is_zero_no_thread():
    """Return True when zero-overhead "no thread" mode is active.

    Requires both the VLLM_ZERO_NO_THREAD env flag (captured at import in
    ``zero_no_thread``) and the ``VLLM_ZERO_OVERHEAD`` setting from vllm envs.
    """
    # Guard on the cheap module-level flag first; only consult envs when set.
    if not zero_no_thread:
        return False
    return envs.VLLM_ZERO_OVERHEAD
|
||||
|
||||
class SpecStepKind(Enum):
    """Kind of the current speculative-decoding step.

    Recorded on the shared context via ``set_spec_step`` and read back with
    ``get_spec_step`` / ``get_spec_last_step``. Member names suggest the
    usual prefill / draft-proposal / scoring phases of speculative decoding
    — confirm exact semantics against the call sites that set them.
    """

    # Initial/unset state (also the value both context fields start with).
    KIND_DEFAULT = 0
    PREFILL = 1
    FIRST_PROPOSAL = 2
    OTHER_PROPOSAL = 3
    SCORE_DECODE = 4
|
||||
|
||||
class ZeroOverheadSpecContext():
    """Mutable holder of per-step speculative-decoding state.

    A single module-level instance (``spec_context``) is shared through the
    ``record_*`` / ``get_*`` helper functions in this module; the class adds
    no behavior of its own.
    """

    def __init__(self):
        # Kind of the step currently in progress (see SpecStepKind).
        self.step_kind = SpecStepKind.KIND_DEFAULT
        # Kind of the previous step; updated by set_spec_step.
        self.last_step = SpecStepKind.KIND_DEFAULT
        # Proposal lengths recorded by record_proposal_lens_list
        # (presumably one entry per sequence — confirm at the caller).
        self.proposal_lens_list = None
        # Draft-proposed token ids; a tensor per record_proposal_token_ids'
        # parameter name (shape not visible here — TODO confirm).
        self.proposal_token_ids = None
        # Accepted token ids and the matching sequence ids, recorded
        # together by record_accepted_token_ids.
        self.accepted_token_ids = None
        self.accepted_seq_ids = None
|
||||
|
||||
# Process-wide singleton holding the speculative-decoding state that the
# record_*/get_* helpers below read and write.
spec_context = ZeroOverheadSpecContext()
|
||||
|
||||
def set_spec_step(_step):
    """Record *_step* as the current step kind on the shared context.

    The previous value of ``step_kind`` is preserved in ``last_step`` so the
    prior phase remains observable via ``get_spec_last_step``.
    """
    # No `global` needed: we only mutate attributes of spec_context,
    # never rebind the module-level name itself.
    spec_context.last_step = spec_context.step_kind
    spec_context.step_kind = _step
|
||||
|
||||
def get_spec_step():
    """Return the step kind currently recorded on the shared context."""
    current_kind = spec_context.step_kind
    return current_kind
|
||||
|
||||
def get_spec_last_step():
    """Return the step kind recorded before the most recent set_spec_step."""
    previous_kind = spec_context.last_step
    return previous_kind
|
||||
|
||||
def record_proposal_lens_list(lens_list):
    """Store the proposal lengths for the current step on the shared context.

    Retrieved later via ``get_proposal_lens_list``.
    """
    # Parameter renamed from `list`, which shadowed the builtin; callers
    # pass it positionally. No `global` needed: attribute mutation only.
    spec_context.proposal_lens_list = lens_list
|
||||
|
||||
def get_proposal_lens_list():
    """Return the proposal lengths recorded by record_proposal_lens_list."""
    recorded = spec_context.proposal_lens_list
    return recorded
|
||||
|
||||
def record_proposal_token_ids(tensor):
    """Store the draft-proposed token ids on the shared context.

    Retrieved later via ``get_proposal_token_ids``. Shape/dtype of *tensor*
    is not constrained here — determined by the caller.
    """
    # No `global` needed: we mutate an attribute, not the module binding.
    spec_context.proposal_token_ids = tensor
|
||||
|
||||
def get_proposal_token_ids():
    """Return the token ids recorded by record_proposal_token_ids."""
    recorded = spec_context.proposal_token_ids
    return recorded
|
||||
|
||||
def record_accepted_token_ids(tensor, seq_ids):
    """Store accepted token ids and their sequence ids on the shared context.

    The pair is read back together by ``get_accepted_token_ids``.
    """
    # No `global` needed: attribute mutation only, no rebinding.
    spec_context.accepted_token_ids = tensor
    spec_context.accepted_seq_ids = seq_ids
|
||||
|
||||
def get_accepted_token_ids():
    """Return (accepted_token_ids, accepted_seq_ids) as last recorded."""
    tokens = spec_context.accepted_token_ids
    seq_ids = spec_context.accepted_seq_ids
    return tokens, seq_ids
|
||||
|
||||
# Zero-overhead scheduling does not run inference on the default stream,
# avoiding the stream-synchronization problems that runtime memory
# allocation would otherwise introduce.
|
||||
# Per-device cache of the side streams handed out by zero_overhead_stream.
alloc_stream = {}


def zero_overhead_stream(target_device):
    """Return the dedicated CUDA stream for *target_device*, creating it lazily.

    One stream is created per device on first request and cached in
    ``alloc_stream`` for reuse. (The previous docstring claimed this copied a
    tensor host-to-device; it only returns a stream.)
    """
    # Membership test on the dict directly — `.keys()` was redundant.
    if target_device not in alloc_stream:
        alloc_stream[target_device] = torch.cuda.Stream(device=target_device)
    return alloc_stream[target_device]
|
||||
Reference in New Issue
Block a user