[gpt-oss] Add gpt-oss bf16 support

2025-08-13 21:25:57 +08:00
parent 5d2e7edf78
commit 17ea2ec6aa
1232 changed files with 777 additions and 36 deletions
--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.triton_utils import tl, triton
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+
+def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool:
+    if req_id in input_batch.min_p_reqs:
+        # Spec decode doesn't support min_p sampling.
+        return False
+    elif (req_id in input_batch.frequency_penalties_reqs
+          or req_id in input_batch.presence_penalties_reqs
+          or req_id in input_batch.repetition_penalties_reqs):
+        # Spec decode doesn't support penalties.
+        return False
+    elif req_id in input_batch.num_logprobs:
+        # Spec decode doesn't support logprobs.
+        return False
+
+    return True
+
+
+@triton.jit
+def prepare_eagle_input_kernel(
+    out_ptr,
+    cu_query_lens_ptr,
+    cu_num_tokens_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    # [start_pos, end_pos)
+    start_pos = tl.load(cu_num_tokens_ptr + pid)
+    end_pos = tl.load(cu_num_tokens_ptr + pid + 1)
+    num_tokens = end_pos - start_pos
+
+    index_start = tl.load(cu_query_lens_ptr + pid)
+
+    num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE)
+    for i in tl.range(num_blocks):
+        offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        tl.store(
+            out_ptr + start_pos + offset,
+            index_start + offset,
+            mask=offset < num_tokens,
+        )