Optimize topk operation in llama4 (#5128)

2025-04-09 17:50:22 +08:00
parent 92823069c4
commit 86a876d883
4 changed files with 18 additions and 15 deletions
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -31,11 +31,15 @@ from sglang.srt.speculative.eagle_utils import (
    EagleVerifyInput,
    EagleVerifyOutput,
    assign_draft_cache_locs,
-    fast_topk,
    select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import empty_context, get_available_gpu_memory, is_cuda_available
+from sglang.srt.utils import (
+    empty_context,
+    fast_topk,
+    get_available_gpu_memory,
+    is_cuda_available,
+)

 if is_cuda_available():
    from sgl_kernel import segment_packbits