[Eagle] Refactor eagle speculative decoding (#3986)

Co-authored-by: Ke Bao <ISPObaoke@163.com>
2025-03-05 08:06:07 -08:00
parent 5be8f1ed98
commit d3d4d76758
22 changed files with 670 additions and 352 deletions
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -230,7 +230,7 @@ def extend(reqs, model_runner):
    batch = ScheduleBatch.init_new(
        reqs=reqs,
        req_to_token_pool=model_runner.req_to_token_pool,
-        token_to_kv_pool=model_runner.token_to_kv_pool,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
        tree_cache=None,
        model_config=model_runner.model_config,
        enable_overlap=False,
@@ -326,7 +326,7 @@ def latency_test_run_once(

    # Clear the pools.
    model_runner.req_to_token_pool.clear()
-    model_runner.token_to_kv_pool.clear()
+    model_runner.token_to_kv_pool_allocator.clear()

    measurement_results = {
        "run_name": run_name,