Fix: Dynamic RoPE Cache Expansion to Prevent Position-ID Out-of-Bounds in EAGLE + Long-Sequence Workloads (#10788)

This commit is contained in:
YAMY
2025-10-18 20:37:43 -07:00
committed by GitHub
parent b288f4f440
commit 80407b0493
4 changed files with 103 additions and 0 deletions

View File

@@ -140,6 +140,7 @@ from sglang.srt.utils import (
log_info_on_rank0,
monkey_patch_p2p_access_check,
monkey_patch_vllm_gguf_config,
reserve_rope_cache_for_long_sequences,
set_cuda_arch,
slow_rank_detector,
)
@@ -898,6 +899,15 @@ class ModelRunner:
f"mem usage={self.weight_load_mem_usage:.2f} GB."
)
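# Pre-expand the RoPE cache before CUDA graph capture so that position IDs
# from long-sequence workloads (e.g. with EAGLE) do not index out of bounds.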
reserve_rope_cache_for_long_sequences(
self.model,
self.server_args,
self.model_config,
self.req_to_token_pool,
logger,
)
if self.server_args.elastic_ep_backend == "mooncake":
# Mooncake does not support `monitored_barrier`
dist.barrier(group=get_tp_group().cpu_group)