diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fa35fd14b..8642812fd 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -341,6 +341,14 @@ class ModelRunner: if server_args.enable_lora: self.init_lora_manager() + # Init Double Sparsity + if server_args.enable_double_sparsity: + if server_args.ds_heavy_channel_type is None: + raise ValueError( + "Please specify the heavy channel type for double sparsity optimization." + ) + self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type) + # Init memory pool and attention backends self.init_memory_pool( min_per_gpu_memory, @@ -506,11 +514,6 @@ class ModelRunner: ) server_args.attention_backend = "triton" server_args.disable_cuda_graph = True - if server_args.ds_heavy_channel_type is None: - raise ValueError( - "Please specify the heavy channel type for double sparsity optimization." - ) - self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type) if self.is_multimodal: if not self.is_multimodal_chunked_prefill_supported: