Revert "Fix: Dynamic RoPE Cache Expansion to Prevent Position-ID Out-of-Bounds in EAGLE + Long-Sequence Workloads" (#11827)
This commit is contained in:
@@ -140,7 +140,6 @@ from sglang.srt.utils import (
     log_info_on_rank0,
     monkey_patch_p2p_access_check,
     monkey_patch_vllm_gguf_config,
-    reserve_rope_cache_for_long_sequences,
     set_cuda_arch,
     slow_rank_detector,
 )
@@ -899,15 +898,6 @@ class ModelRunner:
             f"mem usage={self.weight_load_mem_usage:.2f} GB."
         )
 
-        # Pre-expand RoPE cache before CUDA Graph capture
-        reserve_rope_cache_for_long_sequences(
-            self.model,
-            self.server_args,
-            self.model_config,
-            self.req_to_token_pool,
-            logger,
-        )
-
         if self.server_args.elastic_ep_backend == "mooncake":
             # Mooncake does not support `monitored_barrier`
             dist.barrier(group=get_tp_group().cpu_group)
Reference in New Issue
Block a user