fix(ggml-cuda): 修正CUDA编译标志和WARP_SIZE配置
更新CUDA编译标志以使用正确的fast-math和extended-lambda选项;调整WARP_SIZE为64以适配目标硬件;移除-Wmissing-noreturn警告选项;修复cudaStreamWaitEvent调用缺少参数的问题。
This commit is contained in:
@@ -1,3 +1,11 @@
|
|||||||
# enginex-bi_150-llama.cpp
|
# enginex-bi_150-llama.cpp
|
||||||
|
|
||||||
运行于【天数智芯-天垓150】算力卡的【文本生成】引擎,基于 llama.cpp (b7516) 引擎进行架构特别适配优化。
|
运行于【天数智芯-天垓150】算力卡的【文本生成】引擎,基于 llama.cpp (b7516) 引擎进行架构特别适配优化。
|
||||||
|
|
||||||
|
## Build Docker Image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t enginex-iluvatar/iluvatar-llama.cpp:b7516-bi150 .
|
||||||
|
```
|
||||||
|
|
||||||
|
最新镜像:git.modelhub.org.cn:9443/enginex-iluvatar/iluvatar-llama.cpp:b7516-bi150
|
||||||
@@ -39,11 +39,11 @@ if (GGML_ALL_WARNINGS)
|
|||||||
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||||
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
|
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
|
||||||
-Werror=implicit-int -Werror=implicit-function-declaration)
|
-Werror=implicit-int -Werror=implicit-function-declaration)
|
||||||
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
|
list(APPEND CXX_FLAGS -Wmissing-declarations)
|
||||||
|
list(APPEND CXX_FLAGS -Wmissing-noreturn)
|
||||||
|
|
||||||
list(APPEND C_FLAGS ${WARNING_FLAGS})
|
list(APPEND C_FLAGS ${WARNING_FLAGS})
|
||||||
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
|
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
|
||||||
|
|
||||||
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
|
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
|
||||||
|
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
|
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
|
||||||
|
|||||||
@@ -120,7 +120,8 @@ if (CUDAToolkit_FOUND)
|
|||||||
|
|
||||||
set(CUDA_CXX_FLAGS "")
|
set(CUDA_CXX_FLAGS "")
|
||||||
|
|
||||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
set(CUDA_FLAGS -ffast-math --expt-extended-lambda)
|
||||||
|
# set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||||
|
|
||||||
if (GGML_CUDA_DEBUG)
|
if (GGML_CUDA_DEBUG)
|
||||||
list(APPEND CUDA_FLAGS -lineinfo)
|
list(APPEND CUDA_FLAGS -lineinfo)
|
||||||
|
|||||||
@@ -40,12 +40,12 @@
|
|||||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 64
|
||||||
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
||||||
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
|
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
|
||||||
|
|
||||||
#define GGML_CUDA_CC_PASCAL 600
|
#define GGML_CUDA_CC_PASCAL 300
|
||||||
#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
#define GGML_CUDA_CC_DP4A 300 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||||
#define GGML_CUDA_CC_VOLTA 700
|
#define GGML_CUDA_CC_VOLTA 700
|
||||||
#define GGML_CUDA_CC_TURING 750
|
#define GGML_CUDA_CC_TURING 750
|
||||||
#define GGML_CUDA_CC_AMPERE 800
|
#define GGML_CUDA_CC_AMPERE 800
|
||||||
@@ -350,7 +350,8 @@ static __device__ void no_device_code(
|
|||||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||||
file_name, line, function_name, arch, arch_list);
|
file_name, line, function_name, arch, arch_list);
|
||||||
#endif // defined(GGML_USE_HIP)
|
#endif // defined(GGML_USE_HIP)
|
||||||
__trap();
|
// __trap();
|
||||||
|
__builtin_trap();
|
||||||
|
|
||||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||||
|
|
||||||
|
|||||||
@@ -3246,7 +3246,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
|||||||
|
|
||||||
for (int i = 1; i <= concurrent_event->n_streams; ++i) {
|
for (int i = 1; i <= concurrent_event->n_streams; ++i) {
|
||||||
cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
|
cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
|
||||||
CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
|
CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event,0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -3327,7 +3327,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
|||||||
// Wait on join events of forked streams in the main stream
|
// Wait on join events of forked streams in the main stream
|
||||||
CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1],
|
CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1],
|
||||||
cuda_ctx->stream(cuda_ctx->device, i)));
|
cuda_ctx->stream(cuda_ctx->device, i)));
|
||||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1]));
|
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1],0));
|
||||||
}
|
}
|
||||||
|
|
||||||
is_concurrent_event_active = false;
|
is_concurrent_event_active = false;
|
||||||
|
|||||||
Reference in New Issue
Block a user