[1/2] Add Kernel support for Cutlass based Fused FP4 MoE (#6093)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
@@ -232,6 +232,7 @@ void fp8_blockwise_scaled_grouped_mm(
|
||||
void prepare_moe_input(
|
||||
const torch::Tensor& topk_ids,
|
||||
torch::Tensor& expert_offsets,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation,
|
||||
@@ -251,6 +252,29 @@ void ep_moe_pre_reorder(
|
||||
int64_t topk,
|
||||
bool use_per_token_if_dynamic);
|
||||
|
||||
void shuffle_rows(const torch::Tensor& input_tensor, const torch::Tensor& dst2src_map, torch::Tensor& output_tensor);
|
||||
|
||||
void cutlass_fp4_group_mm(
|
||||
torch::Tensor& output,
|
||||
const torch::Tensor& a,
|
||||
const torch::Tensor& b,
|
||||
const torch::Tensor& a_blockscale,
|
||||
const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas,
|
||||
const torch::Tensor& ab_strides,
|
||||
const torch::Tensor& c_strides,
|
||||
const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets,
|
||||
const torch::Tensor& sf_offsets);
|
||||
|
||||
void scaled_fp4_experts_quant(
|
||||
torch::Tensor& output,
|
||||
torch::Tensor& output_scale,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
torch::Tensor const& output_scale_offset_by_experts);
|
||||
|
||||
/*
|
||||
* From csrc/speculative
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user