[Feature] Support NPUGraph for DeepSeek on Ascend NPU (#9355)
Co-authored-by: Even Zhou <even.y.zhou@outlook.com>
This commit is contained in:
@@ -1,6 +1,12 @@
|
||||
import concurrent.futures
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
|
||||
from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
|
||||
from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
|
||||
from sglang.srt.disaggregation.mooncake.conn import (
|
||||
MooncakeKVBootstrapServer,
|
||||
MooncakeKVManager,
|
||||
@@ -29,6 +35,75 @@ class AscendKVManager(MooncakeKVManager):
|
||||
self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
|
||||
)
|
||||
|
||||
def send_kvcache(
    self,
    mooncake_session_id: str,
    prefill_kv_indices: npt.NDArray[np.int32],
    dst_kv_ptrs: list[int],
    dst_kv_indices: npt.NDArray[np.int32],
    executor: concurrent.futures.ThreadPoolExecutor,
):
    """Transfer the selected KV-cache entries of every layer to the peer.

    The source/destination indices are first paired up into blocks by
    ``group_concurrent_contiguous``. Each block pair becomes a single
    ``(src_addr, dst_addr, length)`` descriptor per layer, addressed from
    the block's first index and sized by the block's element count
    (presumably each block is a contiguous run of indices; confirm against
    the helper).

    Args:
        mooncake_session_id: Session identifier forwarded to
            ``self._transfer_data``.
        prefill_kv_indices: Source-side KV indices.
        dst_kv_ptrs: Destination base pointers, indexed by layer.
        dst_kv_indices: Destination-side KV indices.
        executor: Pool used to submit one task per layer when
            ``self.enable_custom_mem_pool`` is truthy; unused otherwise.

    Returns:
        With the custom memory pool enabled: the first non-zero status
        reported by a per-layer transfer, or 0 if none is non-zero.
        Otherwise: the status returned by the single batched
        ``self._transfer_data`` call.
    """
    src_runs, dst_runs = group_concurrent_contiguous(
        prefill_kv_indices, dst_kv_indices
    )

    # One (src_base, dst_base, item_len) triple per layer. Indexed access is
    # deliberate: a too-short dst_kv_ptrs / kv_item_lens raises IndexError.
    per_layer = []
    for layer in range(len(self.kv_args.kv_data_ptrs)):
        per_layer.append(
            (
                self.kv_args.kv_data_ptrs[layer],
                dst_kv_ptrs[layer],
                self.kv_args.kv_item_lens[layer],
            )
        )

    def build_descriptors(
        src_base: int, dst_base: int, stride: int
    ) -> List[Tuple[int, int, int]]:
        # Expand every (source block, destination block) pair into one
        # descriptor for the layer identified by the given base pointers.
        return [
            (
                src_base + int(src_run[0]) * stride,
                dst_base + int(dst_run[0]) * stride,
                stride * len(src_run),
            )
            for src_run, dst_run in zip(src_runs, dst_runs)
        ]

    def transfer_one_layer(src_base: int, dst_base: int, stride: int) -> int:
        # Worker: issue the transfer for a single layer.
        return self._transfer_data(
            mooncake_session_id, build_descriptors(src_base, dst_base, stride)
        )

    def transfer_all_layers(params: List[Tuple[int, int, int]]) -> int:
        # Worker: merge the descriptors of all layers into one transfer call.
        merged = [
            descriptor
            for triple in params
            for descriptor in build_descriptors(*triple)
        ]
        return self._transfer_data(mooncake_session_id, merged)

    if not self.enable_custom_mem_pool:
        # Combining all layers' params in one batch transfer is more
        # efficient compared to using multiple threads.
        return transfer_all_layers(per_layer)

    pending = [
        executor.submit(transfer_one_layer, *triple) for triple in per_layer
    ]
    for finished in concurrent.futures.as_completed(pending):
        status = finished.result()
        if status != 0:
            # Stop early: cancel whatever has not started yet and report
            # the failing status.
            for other in pending:
                other.cancel()
            return status

    return 0
|
||||
|
||||
|
||||
class AscendKVSender(MooncakeKVSender):
    """Ascend-named subclass of ``MooncakeKVSender``; adds and overrides nothing."""
|
||||
|
||||
Reference in New Issue
Block a user