Fix attention backend (#1448)
This commit is contained in:
@@ -86,6 +86,14 @@ class ModelRunner:
|
|||||||
self.is_multimodal_model = is_multimodal_model(
|
self.is_multimodal_model = is_multimodal_model(
|
||||||
self.model_config.hf_config.architectures
|
self.model_config.hf_config.architectures
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.model_config.attention_arch == AttentionArch.MLA
|
||||||
|
and not self.server_args.disable_mla
|
||||||
|
):
|
||||||
|
logger.info("MLA optimization is turned on. Use triton backend.")
|
||||||
|
self.server_args.attention_backend = "triton"
|
||||||
|
|
||||||
global_server_args_dict.update(
|
global_server_args_dict.update(
|
||||||
{
|
{
|
||||||
"attention_backend": server_args.attention_backend,
|
"attention_backend": server_args.attention_backend,
|
||||||
|
|||||||
@@ -173,10 +173,6 @@ class ServerArgs:
|
|||||||
self.sampling_backend = "pytorch"
|
self.sampling_backend = "pytorch"
|
||||||
|
|
||||||
# Default kernel backends
|
# Default kernel backends
|
||||||
if not self.disable_mla:
|
|
||||||
logger.info("MLA optimization is turned on. Use triton backend.")
|
|
||||||
self.attention_backend = "triton"
|
|
||||||
|
|
||||||
if self.attention_backend is None:
|
if self.attention_backend is None:
|
||||||
self.attention_backend = "flashinfer"
|
self.attention_backend = "flashinfer"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user