Optimize moe align block size kernel (#7794)

This commit is contained in:
Ke Bao
2025-07-07 09:20:30 +08:00
committed by GitHub
parent ba69c153f6
commit a3398d8478
2 changed files with 104 additions and 67 deletions

View File

@@ -363,3 +363,9 @@ inline torch::Tensor pad_tensor(const torch::Tensor& tensor, int64_t alignment =
}
return tensor_padded;
}
// Round `x` up to the nearest power of two.
//
// Returns the smallest power of two that is >= x. Inputs 0 and 1 both yield 1.
// When x > 2^31 the mathematical answer (2^32) does not fit in uint32_t; in
// that case 0 is returned (i.e. 2^32 reduced modulo 2^32) so callers can
// detect the overflow instead of hitting undefined behavior.
inline uint32_t next_pow2(uint32_t x) noexcept {
  if (x <= 1) return 1;
  // Guard: for x > 2^31, (x - 1) has its top bit set, __builtin_clz would
  // return 0 and the shift below would be by 32 bits, which is undefined.
  if (x > (1u << 31)) return 0;
  // x - 1 is non-zero here, so __builtin_clz is well-defined. Subtracting 1
  // first keeps exact powers of two unchanged instead of doubling them.
  return 1u << (32 - __builtin_clz(x - 1));
}