Fuse quantize and rope in trtllm_mla MTP (#10779)
This commit is contained in:
@@ -1399,7 +1399,10 @@ class DeepseekV2AttentionMLA(nn.Module):
|
||||
"""
|
||||
return (
|
||||
self.current_attention_backend == "trtllm_mla"
|
||||
- and forward_batch.forward_mode.is_decode_or_idle()
|
||||
+ and (
|
||||
+ forward_batch.forward_mode.is_decode_or_idle()
|
||||
+ or forward_batch.forward_mode.is_target_verify()
|
||||
+ )
|
||||
and forward_batch.attn_backend.data_type == torch.float8_e4m3fn
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user