vulkan: handle mat_mul with A matrix > 4GB (#16176)

* vulkan: handle mat_mul with A matrix > 4GB

This change splits mat_mul operations with huge A matrix into chunks in the M
dimension. This works well for stable-diffusion use cases where the im2col
matrix has very large M.

Fix the order of setting the stride in mul_mm_cm2 - setting the dimension
clobbers the stride, so stride should be set after.

* build fixes
This commit is contained in:
Jeff Bolz
2025-09-27 20:36:34 -05:00
committed by GitHub
parent e6d65fb02d
commit 1384abf8b8
3 changed files with 59 additions and 15 deletions

View File

@@ -265,7 +265,6 @@ void main() {
tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
#if QUANT_K > 1
tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
@@ -281,6 +280,8 @@ void main() {
tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.padded_N, end_k);
tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
#if !defined(MUL_MAT_ID)