use sgl-kernel moe_align_block_size (#2581)
Co-authored-by: ispobock <ispobaoke@163.com>
Co-authored-by: HandH1998 <1335248067@qq.com>
This commit is contained in:
@@ -95,6 +95,12 @@ class ModelRunner:
|
||||
):
|
||||
logger.info("MLA optimization is turned on. Use triton backend.")
|
||||
self.server_args.attention_backend = "triton"
|
||||
# FIXME(HandH1998): CUDA graph is force-disabled for DeepseekV3ForCausalLM below; the reason is not recorded here.
|
||||
if (
|
||||
"DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
|
||||
and not self.server_args.disable_cuda_graph
|
||||
):
|
||||
self.server_args.disable_cuda_graph = True
|
||||
|
||||
if self.server_args.enable_double_sparsity:
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user