Fix attention crash caused by long-context chunking (#117)
Co-authored-by: root <root@rdtest-node1150.bcc-zwlt.baidu.com>
This commit is contained in:
26
vllm_kunlun/ops/attention/merge_attn_states.py
Normal file
26
vllm_kunlun/ops/attention/merge_attn_states.py
Normal file
@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import torch
import xtorch_ops
from vllm.platforms import current_platform
def merge_attn_states(
    output: torch.Tensor,
    prefix_output: torch.Tensor,
    prefix_lse: torch.Tensor,
    suffix_output: torch.Tensor,
    suffix_lse: torch.Tensor,
    output_lse: Optional[torch.Tensor] = None,
) -> None:
    """Merge chunked (prefix/suffix) attention partial results into ``output``.

    Thin wrapper around the Kunlun ``xtorch_ops.attention_merge_stage``
    kernel, mirroring vLLM's ``merge_attn_states`` interface.

    Args:
        output: Destination tensor for the merged attention output.
        prefix_output: Attention output computed over the prefix chunk.
        prefix_lse: Log-sum-exp values for the prefix chunk.
        suffix_output: Attention output computed over the suffix chunk.
        suffix_lse: Log-sum-exp values for the suffix chunk.
        output_lse: Optional destination for the merged log-sum-exp values.

    Returns:
        None. Results are written into ``output`` (and ``output_lse`` when
        provided) by the backend kernel.
    """
    # NOTE(review): the kernel presumably writes its result into ``output``
    # in place (``output`` is passed as an argument) — confirm against the
    # xtorch_ops API. The function is annotated ``-> None`` to match the
    # upstream vLLM contract, so the backend's return value (if any) is
    # deliberately not propagated to callers.
    xtorch_ops.attention_merge_stage(
        prefix_output,
        prefix_lse,
        suffix_output,
        suffix_lse,
        output,
        output_lse,
    )
|
||||
Reference in New Issue
Block a user