CUDA: General GEMV fusion (#16715)

This commit is contained in:
Aman Gupta
2025-10-26 19:28:04 +08:00
committed by GitHub
parent 3cfa9c3f12
commit f77c13b91f
11 changed files with 1096 additions and 166 deletions

View File

@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn(
GGML_ABORT("fatal error");
}
//expand here so that we can fuse ffn gate
ggml_build_forward_expand(gf, cur);
if (gate && type_gate == LLM_FFN_PAR) {
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
@@ -1091,6 +1094,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
//expand here so that we can fuse ffn gate
ggml_build_forward_expand(gf, cur);
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);