From 8d3f9b9cb16a74abe7422af32847d6b76e4f740e Mon Sep 17 00:00:00 2001
From: xiezhongtao
Date: Fri, 23 Jan 2026 16:42:43 +0800
Subject: [PATCH] =?UTF-8?q?fix(ggml-cuda):=20=E4=BF=AE=E6=AD=A3CUDA?=
 =?UTF-8?q?=E7=BC=96=E8=AF=91=E6=A0=87=E5=BF=97=E5=92=8CWARP=5FSIZE?=
 =?UTF-8?q?=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

更新CUDA编译标志以使用正确的fast-math和extended-lambda选项
调整WARP_SIZE为64以适配目标硬件
将-Wmissing-declarations与-Wmissing-noreturn警告选项拆分为单独的list语句
修复cudaStreamWaitEvent调用缺少参数的问题
---
 README.md                         | 8 ++++++++
 ggml/src/CMakeLists.txt           | 4 ++--
 ggml/src/ggml-cuda/CMakeLists.txt | 3 ++-
 ggml/src/ggml-cuda/common.cuh     | 9 +++++----
 ggml/src/ggml-cuda/ggml-cuda.cu   | 4 ++--
 5 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4044ace..c51168e 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,11 @@
 # enginex-bi_150-llama.cpp
 
 运行于【天数智芯-天垓150】算力卡的【文本生成】引擎,基于 llama.cpp (b7516) 引擎进行架构特别适配优化。
+
+## Build Docker Image
+
+```bash
+docker build -t enginex-iluvatar/iluvatar-llama.cpp:b7516-bi150 .
+```
+
+最新镜像:git.modelhub.org.cn:9443/enginex-iluvatar/iluvatar-llama.cpp:b7516-bi150
\ No newline at end of file
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 262d78a..be896fd 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -39,11 +39,11 @@ if (GGML_ALL_WARNINGS)
         list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
         list(APPEND C_FLAGS   -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                               -Werror=implicit-int -Werror=implicit-function-declaration)
-        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+        list(APPEND CXX_FLAGS -Wmissing-declarations)
+        list(APPEND CXX_FLAGS -Wmissing-noreturn)
 
         list(APPEND C_FLAGS   ${WARNING_FLAGS})
         list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
         ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
 
         add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 67af1d8..0c2b822 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -120,7 +120,8 @@ if (CUDAToolkit_FOUND)
 
     set(CUDA_CXX_FLAGS "")
 
-    set(CUDA_FLAGS -use_fast_math -extended-lambda)
+    set(CUDA_FLAGS -ffast-math --expt-extended-lambda)
+    # set(CUDA_FLAGS -use_fast_math -extended-lambda)
 
     if (GGML_CUDA_DEBUG)
         list(APPEND CUDA_FLAGS -lineinfo)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9fcb2f9..e37378c 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -40,12 +40,12 @@
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
 
-#define WARP_SIZE 32
+#define WARP_SIZE 64
 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
 
-#define GGML_CUDA_CC_PASCAL     600
-#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_PASCAL     300
+#define GGML_CUDA_CC_DP4A       300 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define GGML_CUDA_CC_VOLTA      700
 #define GGML_CUDA_CC_TURING     750
 #define GGML_CUDA_CC_AMPERE     800
@@ -350,7 +350,8 @@ static __device__ void no_device_code(
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
 #endif // defined(GGML_USE_HIP)
-    __trap();
+    // __trap();
+    __builtin_trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 55fa2e6..187162b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3246,7 +3246,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
             for (int i = 1; i <= concurrent_event->n_streams; ++i) {
                 cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
-                CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
+                CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event,0));
             }
         }
     };
@@ -3327,7 +3327,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
                 // Wait on join events of forked streams in the main stream
                 CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1], cuda_ctx->stream(cuda_ctx->device, i)));
-                CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1]));
+                CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1],0));
             }
 
             is_concurrent_event_active = false;