diff --git a/python/sglang/srt/speculative/draft_utils.py b/python/sglang/srt/speculative/draft_utils.py index fd856b61e..aab54cc62 100644 --- a/python/sglang/srt/speculative/draft_utils.py +++ b/python/sglang/srt/speculative/draft_utils.py @@ -96,7 +96,6 @@ class DraftBackendFactory: FlashInferMultiStepDraftBackend, ) - self.has_prefill_wrapper_verify = True return FlashInferMultiStepDraftBackend( self.draft_model_runner, self.topk, self.speculative_num_steps ) @@ -105,7 +104,6 @@ class DraftBackendFactory: FlashInferMLAMultiStepDraftBackend, ) - self.has_prefill_wrapper_verify = True return FlashInferMLAMultiStepDraftBackend( self.draft_model_runner, self.topk, self.speculative_num_steps ) @@ -149,7 +147,6 @@ class DraftBackendFactory: TRTLLMHAAttnMultiStepDraftBackend, ) - self.has_prefill_wrapper_verify = True return TRTLLMHAAttnMultiStepDraftBackend( self.draft_model_runner, self.topk, self.speculative_num_steps ) @@ -164,7 +161,6 @@ class DraftBackendFactory: TRTLLMMLAMultiStepDraftBackend, ) - self.has_prefill_wrapper_verify = True return TRTLLMMLAMultiStepDraftBackend( self.draft_model_runner, self.topk, self.speculative_num_steps ) diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py index b538a4bf8..7a04b5c12 100644 --- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py @@ -41,7 +41,6 @@ class EAGLEDraftCudaGraphRunner: # Parse args self.eagle_worker = eagle_worker self.model_runner = model_runner = eagle_worker.model_runner - self.model_runner: EAGLEWorker self.graphs = {} self.output_buffers = {} self.enable_torch_compile = model_runner.server_args.enable_torch_compile diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 08282e533..d152bf8fd 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -192,10 +192,6 @@ class EAGLEWorker(TpModelWorker): def init_attention_backend(self): # Create multi-step attn backends and cuda graph runners - - self.has_prefill_wrapper_verify = False - self.draft_extend_attn_backend = None - draft_backend_factory = DraftBackendFactory( self.server_args, self.draft_model_runner,