fix double sparsity initialization (#6905)
This commit is contained in:
@@ -341,6 +341,14 @@ class ModelRunner:
|
|||||||
if server_args.enable_lora:
|
if server_args.enable_lora:
|
||||||
self.init_lora_manager()
|
self.init_lora_manager()
|
||||||
|
|
||||||
|
# Init Double Sparsity
|
||||||
|
if server_args.enable_double_sparsity:
|
||||||
|
if server_args.ds_heavy_channel_type is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Please specify the heavy channel type for double sparsity optimization."
|
||||||
|
)
|
||||||
|
self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
|
||||||
|
|
||||||
# Init memory pool and attention backends
|
# Init memory pool and attention backends
|
||||||
self.init_memory_pool(
|
self.init_memory_pool(
|
||||||
min_per_gpu_memory,
|
min_per_gpu_memory,
|
||||||
@@ -506,11 +514,6 @@ class ModelRunner:
|
|||||||
)
|
)
|
||||||
server_args.attention_backend = "triton"
|
server_args.attention_backend = "triton"
|
||||||
server_args.disable_cuda_graph = True
|
server_args.disable_cuda_graph = True
|
||||||
if server_args.ds_heavy_channel_type is None:
|
|
||||||
raise ValueError(
|
|
||||||
"Please specify the heavy channel type for double sparsity optimization."
|
|
||||||
)
|
|
||||||
self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
|
|
||||||
|
|
||||||
if self.is_multimodal:
|
if self.is_multimodal:
|
||||||
if not self.is_multimodal_chunked_prefill_supported:
|
if not self.is_multimodal_chunked_prefill_supported:
|
||||||
|
|||||||
Reference in New Issue
Block a user