[Fix] Fix wrong backend chosen in hybrid backend (#8989)
This commit is contained in:
@@ -378,6 +378,12 @@ class ModelRunner:
             )
             server_args.attention_backend = "torch_native"
 
+        if server_args.prefill_attention_backend is not None and (
+            server_args.prefill_attention_backend
+            == server_args.decode_attention_backend
+        ): # override the default attention backend
+            server_args.attention_backend = server_args.prefill_attention_backend
+
         if server_args.attention_backend is None:
             """
             Auto select the fastest attention backend.
Reference in New Issue
Block a user