AITER backend extension and workload optimizations (#6838)
Co-authored-by: wunhuang <wunhuang@amd.com>
Co-authored-by: Hubert Lu <Hubert.Lu@amd.com>
This commit is contained in:
@@ -355,6 +355,15 @@ class ModelRunner:
|
||||
# MLA architecture
|
||||
if is_hopper_with_cuda_12_3():
|
||||
server_args.attention_backend = "fa3"
|
||||
elif _is_hip:
|
||||
head_num = self.model_config.get_num_kv_heads(self.tp_size)
|
||||
# TODO: aiter currently supports only a head count of 16 or 128
|
||||
if (
|
||||
head_num == 128 or head_num == 16
|
||||
) and self.spec_algorithm.is_none():
|
||||
server_args.attention_backend = "aiter"
|
||||
else:
|
||||
server_args.attention_backend = "triton"
|
||||
else:
|
||||
server_args.attention_backend = "triton"
|
||||
logger.info(
|
||||
@@ -363,6 +372,7 @@ class ModelRunner:
|
||||
elif self.use_mla_backend:
|
||||
if server_args.device != "cpu":
|
||||
if server_args.attention_backend in [
|
||||
"aiter",
|
||||
"flashinfer",
|
||||
"fa3",
|
||||
"triton",
|
||||
|
||||
Reference in New Issue
Block a user