refactor: 统一硬件相关头文件引用

将分散在各源文件中的 CUDA/HIP/MUSA 硬件相关头文件引用，统一收敛到 vendors 目录下的对应头文件（例如 vendors/functions.h）中，以提高代码可维护性；同时移除重复的头文件引用，并优化构建配置。
2026-01-20 10:14:31 +08:00
parent 5aef6c175a
commit 2bd9bd4cc2
98 changed files with 1757 additions and 1286 deletions
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -1,7 +1,4 @@
-#include <ATen/ATen.h>
-#include <ATen/Parallel.h>
-#include <torch/all.h>
-
+#include "../../vendors/functions.h"
 // _dyn_quant_matmul_4bit is only available on AArch64.
 #if defined(__aarch64__)
  #include <ATen/ops/_dyn_quant_matmul_4bit.h>
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -17,13 +17,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include <cuda/std/limits>
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
+
+#include "../../vendors/functions.h"
 namespace cg = cooperative_groups;

 namespace vllm {
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -1,10 +1,4 @@
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cub/cub.cuh>
-
-#include <ATen/ATen.h>
-#include <ATen/cuda/Atomic.cuh>
+#include "../vendors/functions.h"

 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include "../vendors/functions.h"

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -1,6 +1,4 @@
-#include <c10/core/ScalarType.h>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
+#include "../vendors/functions.h"
 #include "permute_unpermute_kernels/moe_permute_unpermute_kernel.h"
 #include "permute_unpermute_kernels/dispatch.h"
 #include "core/registration.h"
--- a/csrc/moe/moe_wna16.cu
+++ b/csrc/moe/moe_wna16.cu
@@ -1,11 +1,7 @@

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
+#include "../vendors/functions.h"
+

-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
 #include "moe_wna16_utils.h"

 #define DIVIDE(x, size) (((x) + (size) - 1) / (size))
--- a/csrc/moe/moe_wna16_utils.h
+++ b/csrc/moe/moe_wna16_utils.h
@@ -1,6 +1,5 @@

-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include "../vendors/functions.h"

 template <typename scalar_t>
 class ScalarType {};
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <cuda_fp8.h>
+#include "vendors/functions.h"
 #define MOE_SWITCH(TYPE, ...)                                     \
  at::ScalarType _st = ::detail::scalar_type(TYPE);               \
  switch (_st) {                                                  \
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -2,12 +2,11 @@
 // reference from tensorrt_llm moe kernel implementation archive in
 // https://github.com/BBuf/tensorrt-llm-moe/tree/master

-#include <c10/core/ScalarType.h>
 #include <torch/all.h>
 #include "dispatch.h"
-#include <cub/cub.cuh>
-#include <cub/device/device_radix_sort.cuh>
-#include <cub/util_type.cuh>
+
+
+#include "../../vendors/functions.h"
 #include "cutlass/numeric_size.h"
 #include "cutlass/array.h"

--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -17,9 +17,9 @@
 * limitations under the License.
 */
 #include <type_traits>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "../vendors/functions.h"
+
+
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"