fix(ggml-cuda): 修正CUDA编译标志和WARP_SIZE配置

更新CUDA编译标志以使用正确的fast-math和extended-lambda选项；调整WARP_SIZE为64以适配目标硬件；移除-Wmissing-noreturn警告选项；修复cudaStreamWaitEvent调用缺少参数的问题。
2026-01-23 16:42:43 +08:00
parent b1cf23ae3e
commit 8d3f9b9cb1
5 changed files with 19 additions and 9 deletions
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -40,12 +40,12 @@
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

-#define WARP_SIZE 32
+#define WARP_SIZE 64
 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define GGML_CUDA_CC_PASCAL          600
-#define GGML_CUDA_CC_DP4A            610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_PASCAL          300
+#define GGML_CUDA_CC_DP4A            300 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define GGML_CUDA_CC_VOLTA           700
 #define GGML_CUDA_CC_TURING          750
 #define GGML_CUDA_CC_AMPERE          800
@@ -350,7 +350,8 @@ static __device__ void no_device_code(
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
 #endif // defined(GGML_USE_HIP)
-    __trap();
+    // __trap();
+    __builtin_trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning