Fix double sparsity initialization (#6905)

This commit is contained in:
Chi-Chih Chang
2025-09-06 11:45:24 +08:00
committed by GitHub
parent 8d114f254b
commit ad26f298e2

View File

@@ -341,6 +341,14 @@ class ModelRunner:
if server_args.enable_lora:
self.init_lora_manager()
# Init Double Sparsity
if server_args.enable_double_sparsity:
if server_args.ds_heavy_channel_type is None:
raise ValueError(
"Please specify the heavy channel type for double sparsity optimization."
)
self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
# Init memory pool and attention backends
self.init_memory_pool(
min_per_gpu_memory,
@@ -506,11 +514,6 @@ class ModelRunner:
)
server_args.attention_backend = "triton"
server_args.disable_cuda_graph = True
if server_args.ds_heavy_channel_type is None:
raise ValueError(
"Please specify the heavy channel type for double sparsity optimization."
)
self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
if self.is_multimodal:
if not self.is_multimodal_chunked_prefill_supported: