init src 0.9.2
This commit is contained in:
71
vllm/zero_overhead/utils.py
Normal file
71
vllm/zero_overhead/utils.py
Normal file
@@ -0,0 +1,71 @@
|
||||
|
||||
|
||||
from enum import Enum
|
||||
import os
|
||||
import torch
|
||||
import vllm.envs as envs
|
||||
|
||||
# One-time snapshot taken at import: set VLLM_ZERO_NO_THREAD=1 in the
# environment to enable the "no thread" zero-overhead mode (see
# is_zero_no_thread below). Changing the env var later has no effect.
zero_no_thread = os.environ.get('VLLM_ZERO_NO_THREAD') == '1'
|
||||
|
||||
def is_zero_no_thread():
    """Return True when zero-overhead "no thread" mode is active.

    Requires both the VLLM_ZERO_NO_THREAD env flag (captured at import in
    ``zero_no_thread``) and the ``VLLM_ZERO_OVERHEAD`` setting from vllm envs.
    """
    # Guard on the cheap module-level flag first; only consult envs when set.
    if not zero_no_thread:
        return False
    return envs.VLLM_ZERO_OVERHEAD
|
||||
|
||||
class SpecStepKind(Enum):
    """Kind of the current speculative-decoding step.

    Recorded on the shared context via ``set_spec_step`` and read back with
    ``get_spec_step`` / ``get_spec_last_step``. Member names suggest the
    usual prefill / draft-proposal / scoring phases of speculative decoding
    — confirm exact semantics against the call sites that set them.
    """

    # Initial/unset state (also the value both context fields start with).
    KIND_DEFAULT = 0
    PREFILL = 1
    FIRST_PROPOSAL = 2
    OTHER_PROPOSAL = 3
    SCORE_DECODE = 4
|
||||
|
||||
class ZeroOverheadSpecContext():
    """Mutable holder of per-step speculative-decoding state.

    A single module-level instance (``spec_context``) is shared through the
    ``record_*`` / ``get_*`` helper functions in this module; the class adds
    no behavior of its own.
    """

    def __init__(self):
        # Kind of the step currently in progress (see SpecStepKind).
        self.step_kind = SpecStepKind.KIND_DEFAULT
        # Kind of the previous step; updated by set_spec_step.
        self.last_step = SpecStepKind.KIND_DEFAULT
        # Proposal lengths recorded by record_proposal_lens_list
        # (presumably one entry per sequence — confirm at the caller).
        self.proposal_lens_list = None
        # Draft-proposed token ids; a tensor per record_proposal_token_ids'
        # parameter name (shape not visible here — TODO confirm).
        self.proposal_token_ids = None
        # Accepted token ids and the matching sequence ids, recorded
        # together by record_accepted_token_ids.
        self.accepted_token_ids = None
        self.accepted_seq_ids = None
|
||||
|
||||
# Process-wide singleton holding the speculative-decoding state that the
# record_*/get_* helpers below read and write.
spec_context = ZeroOverheadSpecContext()
|
||||
|
||||
def set_spec_step(_step):
    """Record *_step* as the current step kind on the shared context.

    The previous value of ``step_kind`` is preserved in ``last_step`` so the
    prior phase remains observable via ``get_spec_last_step``.
    """
    # No `global` needed: we only mutate attributes of spec_context,
    # never rebind the module-level name itself.
    spec_context.last_step = spec_context.step_kind
    spec_context.step_kind = _step
|
||||
|
||||
def get_spec_step():
    """Return the step kind currently recorded on the shared context."""
    current_kind = spec_context.step_kind
    return current_kind
|
||||
|
||||
def get_spec_last_step():
    """Return the step kind recorded before the most recent set_spec_step."""
    previous_kind = spec_context.last_step
    return previous_kind
|
||||
|
||||
def record_proposal_lens_list(lens_list):
    """Store the proposal lengths for the current step on the shared context.

    Retrieved later via ``get_proposal_lens_list``.
    """
    # Parameter renamed from `list`, which shadowed the builtin; callers
    # pass it positionally. No `global` needed: attribute mutation only.
    spec_context.proposal_lens_list = lens_list
|
||||
|
||||
def get_proposal_lens_list():
    """Return the proposal lengths recorded by record_proposal_lens_list."""
    recorded = spec_context.proposal_lens_list
    return recorded
|
||||
|
||||
def record_proposal_token_ids(tensor):
    """Store the draft-proposed token ids on the shared context.

    Retrieved later via ``get_proposal_token_ids``. Shape/dtype of *tensor*
    is not constrained here — determined by the caller.
    """
    # No `global` needed: we mutate an attribute, not the module binding.
    spec_context.proposal_token_ids = tensor
|
||||
|
||||
def get_proposal_token_ids():
    """Return the token ids recorded by record_proposal_token_ids."""
    recorded = spec_context.proposal_token_ids
    return recorded
|
||||
|
||||
def record_accepted_token_ids(tensor, seq_ids):
    """Store accepted token ids and their sequence ids on the shared context.

    The pair is read back together by ``get_accepted_token_ids``.
    """
    # No `global` needed: attribute mutation only, no rebinding.
    spec_context.accepted_token_ids = tensor
    spec_context.accepted_seq_ids = seq_ids
|
||||
|
||||
def get_accepted_token_ids():
    """Return (accepted_token_ids, accepted_seq_ids) as last recorded."""
    tokens = spec_context.accepted_token_ids
    seq_ids = spec_context.accepted_seq_ids
    return tokens, seq_ids
|
||||
|
||||
# Zero-overhead scheduling does not run inference on the default stream,
# avoiding the stream-synchronization problems that runtime memory
# allocation would otherwise introduce.
|
||||
# Per-device cache of the side streams handed out by zero_overhead_stream.
alloc_stream = {}


def zero_overhead_stream(target_device):
    """Return the dedicated CUDA stream for *target_device*, creating it lazily.

    One stream is created per device on first request and cached in
    ``alloc_stream`` for reuse. (The previous docstring claimed this copied a
    tensor host-to-device; it only returns a stream.)
    """
    # Membership test on the dict directly — `.keys()` was redundant.
    if target_device not in alloc_stream:
        alloc_stream[target_device] = torch.cuda.Stream(device=target_device)
    return alloc_stream[target_device]
|
||||
Reference in New Issue
Block a user