[CustomOp] support TensorList for dispatchFFNCombine (#5665)
### What this PR does / why we need it?
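Add TensorList support to dispatch_ffn_combine: weight1, weight2, scale1 and scale2 now accept lists of tensors (Tensor[]) instead of a single Tensor each, so that the operator can be adapted to EPLB.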
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
Single Operator Testing
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef
---------
Signed-off-by: lhchg <lhao_cheng@163.com>
Co-authored-by: lihaocheng <lihaosheng1@h-partners.com>
This commit is contained in:
@@ -727,11 +727,11 @@ void batch_matmul_transpose(const at::Tensor &tensor_a, const at::Tensor &tensor
|
||||
|
||||
at::Tensor& dispatch_ffn_combine(
|
||||
const at::Tensor& x,
|
||||
const at::Tensor& weight1,
|
||||
const at::Tensor& weight2,
|
||||
const at::TensorList& weight1,
|
||||
const at::TensorList& weight2,
|
||||
const at::Tensor& expert_idx,
|
||||
const at::Tensor& scale1,
|
||||
const at::Tensor& scale2,
|
||||
const at::TensorList& scale1,
|
||||
const at::TensorList& scale2,
|
||||
const at::Tensor& probs,
|
||||
c10::string_view group,
|
||||
int64_t max_output_size,
|
||||
@@ -1383,8 +1383,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
ops.impl("npu_sparse_flash_attention", torch::kPrivateUse1, &vllm_ascend::npu_sparse_flash_attention);
|
||||
|
||||
ops.def(
|
||||
"dispatch_ffn_combine(Tensor x, Tensor weight1, Tensor weight2, Tensor expert_idx,"
|
||||
" Tensor scale1, Tensor scale2, Tensor probs, str group,"
|
||||
"dispatch_ffn_combine(Tensor x, Tensor[] weight1, Tensor[] weight2, Tensor expert_idx,"
|
||||
" Tensor[] scale1, Tensor[] scale2, Tensor probs, str group,"
|
||||
" int max_output_size, Tensor! out) -> Tensor"
|
||||
);
|
||||
ops.impl("dispatch_ffn_combine", torch::kPrivateUse1, &vllm_ascend::dispatch_ffn_combine);
|
||||
|
||||
Reference in New Issue
Block a user