[Bugfix] fix kv buffer register & dp attention & deepepmoe (#9327)
This commit is contained in:
@@ -23,9 +23,7 @@ class AscendKVManager(MooncakeKVManager):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def register_buffer_to_engine(self):
|
def register_buffer_to_engine(self):
|
||||||
self.engine.register(
|
self.engine.batch_register(self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens)
|
||||||
self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
|
|
||||||
)
|
|
||||||
# The Ascend backend optimizes batch registration for small memory blocks.
|
# The Ascend backend optimizes batch registration for small memory blocks.
|
||||||
self.engine.batch_register(
|
self.engine.batch_register(
|
||||||
self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
|
self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
|
||||||
|
|||||||
@@ -234,7 +234,7 @@ def initialize_dp_attention(
|
|||||||
_DpGatheredBufferWrapper.set_metadata(
|
_DpGatheredBufferWrapper.set_metadata(
|
||||||
hidden_size=model_config.hidden_size,
|
hidden_size=model_config.hidden_size,
|
||||||
dtype=model_config.dtype,
|
dtype=model_config.dtype,
|
||||||
device=torch.device("cuda"),
|
device=torch.device(server_args.device),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -736,7 +736,7 @@ class DeepEPMoE(EPMoE):
|
|||||||
assert isinstance(dispatch_output, AscendDeepEPLLOutput)
|
assert isinstance(dispatch_output, AscendDeepEPLLOutput)
|
||||||
hidden_states, topk_idx, topk_weights, _, seg_indptr, _ = dispatch_output
|
hidden_states, topk_idx, topk_weights, _, seg_indptr, _ = dispatch_output
|
||||||
assert self.quant_method is not None
|
assert self.quant_method is not None
|
||||||
assert self.activation == "silu"
|
assert self.moe_runner_config.activation == "silu"
|
||||||
|
|
||||||
# NOTE: Ascend's Dispatch & Combine does not support FP16
|
# NOTE: Ascend's Dispatch & Combine does not support FP16
|
||||||
output_dtype = torch.bfloat16
|
output_dtype = torch.bfloat16
|
||||||
|
|||||||
Reference in New Issue
Block a user