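// Consolidates the CUDA/HIP/MUSA hardware-specific header includes that were
// scattered across individual files into the corresponding headers under the
// vendors directory, improving code maintainability. Duplicate header includes
// are removed and the build configuration is streamlined.
// (50 lines, 1.6 KiB, C++)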
#pragma once
|
|
#include <torch/all.h>
|
|
#include <ATen/cuda/CUDAContext.h>
|
|
#include <c10/cuda/CUDAGuard.h>
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <cuda.h>
|
|
#include <cublas_v2.h>
|
|
#include <cuda_bf16.h>
|
|
#include <cuda_fp16.h>
|
|
#include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
|
|
#include <c10/util/BFloat16.h>
|
|
#include <c10/util/Half.h>
|
|
#include <cub/block/block_load.cuh>
|
|
#include <cub/block/block_store.cuh>
|
|
#include <cub/block/block_scan.cuh>
|
|
#include <cub/cub.cuh>
|
|
#include <cub/device/device_radix_sort.cuh>
|
|
#include <cub/util_type.cuh>
|
|
#include <c10/core/ScalarType.h>
|
|
#include <c10/cuda/CUDAStream.h>
|
|
#include <cuda/std/limits>
|
|
#include <cooperative_groups.h>
|
|
#include <cooperative_groups/reduce.h>
|
|
#include <cuda/annotated_ptr>
|
|
#include <c10/util/Float8_e4m3fn.h>
|
|
#include <c10/util/Float8_e4m3fnuz.h>
|
|
#include <ATen/cuda/CUDAContext.h>
|
|
#include <c10/cuda/CUDAGuard.h>
|
|
#include <c10/util/Optional.h>
|
|
#if CUDART_VERSION >= 12050
|
|
#include <cuda_fp8.h>
|
|
#endif // CUDART_VERSION >= 12050
|
|
|
|
// Compatibility shims for CUDA runtimes older than 11.2 (CUDART_VERSION < 11020).
//
// Each macro rewrites one identifier to another at preprocessing time, so code
// in the including translation unit can use the left-hand spelling
// unconditionally.
// NOTE(review): presumably the right-hand names are the pre-11.2 equivalents
// shipped by the CUDA driver / cuBLAS headers — confirm against the oldest
// toolkit that is still supported.
//
// CUDART_VERSION must already be defined when this point is reached (it is
// expected to come from <cuda_runtime.h>); an undefined macro evaluates to 0
// inside #if, which would enable this branch.
#if CUDART_VERSION < 11020
// Driver-API device attribute used to query virtual memory management support.
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
// cuBLAS math-mode selector: the TF32 name resolves to the generic tensor-op mode.
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
// cuBLAS compute-type enumerators and the enum type itself are mapped onto the
// cudaDataType_t family.
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020
|
|
|
|
#if CUB_VERSION >= 200800
|
|
#include <cuda/std/functional>
|
|
using CubAddOp = cuda::std::plus<>;
|
|
using CubMaxOp = cuda::maximum<>;
|
|
#else // if CUB_VERSION < 200800
|
|
using CubAddOp = cub::Sum;
|
|
using CubMaxOp = cub::Max;
|
|
#endif // CUB_VERSION
|