[300I] support decode-only aclgraph mode (#6849)
### What this PR does / why we need it?
This PR supports aclgraph mode on 310P, but it has some problems:
- due to the event-id hardware limit, the number of graphs will be limited.
- the CANN version that supports this feature cannot be obtained from
outside of Huawei.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
local test
- vLLM version: v0.15.0
- vLLM main:
83b47f67b1
Signed-off-by: Tflowers-0129 <2906339855@qq.com>
This commit is contained in:
@@ -110,7 +110,19 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl):
|
|||||||
device=query.device,
|
device=query.device,
|
||||||
non_blocking=True,
|
non_blocking=True,
|
||||||
)
|
)
|
||||||
return super().forward_paged_attention(query, attn_metadata, output)
|
|
||||||
|
torch_npu._npu_paged_attention(
|
||||||
|
query=query,
|
||||||
|
key_cache=self.key_cache,
|
||||||
|
value_cache=self.value_cache,
|
||||||
|
num_kv_heads=self.num_kv_heads,
|
||||||
|
num_heads=self.num_heads,
|
||||||
|
scale_value=self.scale,
|
||||||
|
block_table=attn_metadata.block_tables,
|
||||||
|
context_lens=attn_metadata.seq_lens,
|
||||||
|
out=output,
|
||||||
|
)
|
||||||
|
return output
|
||||||
|
|
||||||
def forward_prefill_310(self, query, key, value, attn_metadata, output):
|
def forward_prefill_310(self, query, key, value, attn_metadata, output):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -833,7 +833,7 @@ def weak_ref_tensor(tensor: Any) -> Any:
|
|||||||
but will not keep the original tensor alive.
|
but will not keep the original tensor alive.
|
||||||
"""
|
"""
|
||||||
if isinstance(tensor, torch.Tensor):
|
if isinstance(tensor, torch.Tensor):
|
||||||
return torch.ops._C_ascend.weak_ref_tensor(tensor)
|
return torch_npu._C._weak_ref_tensor(tensor)
|
||||||
else:
|
else:
|
||||||
return tensor
|
return tensor
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user