42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
|
|
#
|
|
# This file contains code copied from the flash-linear-attention project.
|
|
# The original source code was licensed under the MIT license and included
|
|
# the following copyright notice:
|
|
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
|
|
# ruff: noqa: E501
|
|
import torch
|
|
|
|
from vllm.triton_utils import triton
|
|
|
|
from .utils import tensor_cache
|
|
|
|
|
|
@tensor_cache
|
|
def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
|
|
return cu_seqlens[1:] - cu_seqlens[:-1]
|
|
|
|
|
|
@tensor_cache
|
|
def prepare_chunk_indices(
|
|
cu_seqlens: torch.LongTensor, chunk_size: int
|
|
) -> torch.LongTensor:
|
|
indices = torch.cat(
|
|
[
|
|
torch.arange(n)
|
|
for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
|
|
]
|
|
)
|
|
return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)
|
|
|
|
|
|
@tensor_cache
|
|
def prepare_chunk_offsets(
|
|
cu_seqlens: torch.LongTensor, chunk_size: int
|
|
) -> torch.LongTensor:
|
|
return torch.cat(
|
|
[cu_seqlens.new_tensor([0]), triton.cdiv(prepare_lens(cu_seqlens), chunk_size)]
|
|
).cumsum(-1)
|