From a577d38f9502eddc252baf8174701076e2bab864 Mon Sep 17 00:00:00 2001
From: xiezhongtao
Date: Tue, 27 Jan 2026 13:54:05 +0800
Subject: [PATCH] =?UTF-8?q?feat(=E6=A8=A1=E5=9E=8B=E5=8A=A0=E8=BD=BD):=20?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0sync=5Fto=5Ftemp=E9=80=89=E9=A1=B9=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E4=B8=B4=E6=97=B6=E7=9B=AE=E5=BD=95=E5=8A=A0=E8=BD=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加sync_to_temp参数控制是否将模型文件复制到临时目录后再加载
---
 Dockerfile          | 29 ++++++++++++++++++++++
 common/arg.cpp      |  8 ++++++
 common/common.cpp   |  1 +
 common/common.h     |  1 +
 include/llama.h     |  1 +
 src/llama-model.cpp |  1 +
 src/llama.cpp       | 61 ++++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..eb4a313
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+FROM corex:4.3.8 as builder
+ADD . /app
+WORKDIR /app
+RUN sed -i 's/-x cuda/-x ivcore/g' /usr/local/share/cmake-3.25/Modules/Compiler/Clang-CUDA.cmake &&\
+    cmake -S . \
+        -B build \
+        -DGGML_CUDA=ON \
+        -DLLAMA_CURL=OFF \
+        -DCMAKE_CUDA_ARCHITECTURES=ivcore11 \
+        -DCMAKE_CUDA_FLAGS="-x ivcore -std=c++17" \
+        -DCMAKE_CUDA_STANDARD=17 \
+        -DCMAKE_CXX_STANDARD=17 \
+        -DGGML_CUDA_FA=OFF \
+        -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_CUDA_FORCE_CUBLAS=ON \
+        -DGGML_CPU=ON &&\
+    cmake --build build --config Release -j \
+        --target llama-server llama-cli


+FROM ubuntu
+WORKDIR /app
+COPY --from=builder /usr/local/corex/ /usr/local/corex/
+COPY --from=builder /usr/local/openmpi/lib/ /usr/local/openmpi/lib/
+COPY --from=builder /app/build/bin/llama-server /app/llama-server
+COPY --from=builder /app/build/bin/llama-cli /app/llama-cli
+RUN apt update && apt install -y libgomp1
+ENV PATH=/usr/local/corex/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/openmpi/lib:/usr/local/lib
diff --git a/common/arg.cpp b/common/arg.cpp
index 1302065..7ae3aa5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2039,6 +2039,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"--sync-to-temp"},
+        {"--no-sync-to-temp"},
+        string_format("whether to copy model to temporary directory before loading (default: %s)", params.sync_to_temp ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.sync_to_temp = value;
+        }
+    ).set_env("LLAMA_ARG_SYNC_TO_TEMP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
diff --git a/common/common.cpp b/common/common.cpp
index acf2ec8..d8f333f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1353,6 +1353,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host = params.no_host;
+    mparams.sync_to_temp = params.sync_to_temp;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 3343720..0512841 100644
--- a/common/common.h
+++ b/common/common.h
@@ -430,6 +430,7 @@ struct common_params {
     bool no_op_offload = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
    bool no_host = false; // bypass host buffer allowing extra buffers to be used
+    bool sync_to_temp = false; // copy model to temporary directory before loading

    bool single_turn = false; // single turn chat conversation

diff --git a/include/llama.h b/include/llama.h
index f862930..259b95d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -314,6 +314,7 @@ extern "C" {
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
         bool no_alloc;        // only load metadata and simulate memory allocations
+        bool sync_to_temp;    // copy model to temporary directory before loading
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0d5bcc6..b9bf42b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7743,6 +7743,7 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts  =*/ true,
         /*.no_host          =*/ false,
         /*.no_alloc         =*/ false,
+        /*.sync_to_temp     =*/ false,
     };

     return result;
diff --git a/src/llama.cpp b/src/llama.cpp
index 1e18637..704acfc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19,9 +19,14 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <cerrno>
 #include <cstdio>
 #include <cstring>
+#include <cstdlib>
+#include <filesystem>
 #include <ctime>
+#include <unistd.h>
+

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -940,7 +945,61 @@ static struct llama_model * llama_model_load_from_file_impl(
                 props.memory_free/1024/1024);
     }

-    const int status = llama_model_load(path_model, splits, *model, params);
+    // sync_to_temp: stage the model file(s) in a fresh temporary directory and
+    // load from there (e.g. so the load reads from fast local storage)
+    std::string temp_model_path = path_model;
+    std::vector<std::string> temp_splits = splits;
+    std::string temp_dir;
+    if (params.sync_to_temp) {
+        // Create temporary directory (mkdtemp is POSIX-only)
+        char temp_dir_template[] = "/tmp/llama_model_XXXXXX";
+        char * temp_dir_ptr = mkdtemp(temp_dir_template);
+        if (!temp_dir_ptr) {
+            LLAMA_LOG_ERROR("%s: failed to create temporary directory: %s\n", __func__, strerror(errno));
+            llama_model_free(model);
+            return nullptr;
+        }
+        temp_dir = temp_dir_ptr;
+
+        // Copy model file to temporary directory
+        std::string model_filename = std::filesystem::path(path_model).filename().string();
+        temp_model_path = temp_dir + "/" + model_filename;
+
+        try {
+            LLAMA_LOG_INFO("%s: copying model to temporary directory: %s\n", __func__, temp_model_path.c_str());
+            std::filesystem::copy_file(path_model, temp_model_path, std::filesystem::copy_options::overwrite_existing);
+
+            // Also copy split files if any, and point the loader at the copies
+            if (!temp_splits.empty()) {
+                temp_splits[0] = temp_model_path;
+            }
+            for (size_t i = 1; i < splits.size(); ++i) {
+                const std::string & split_path = splits[i];
+                std::string split_filename = std::filesystem::path(split_path).filename().string();
+                std::string temp_split_path = temp_dir + "/" + split_filename;
+                std::filesystem::copy_file(split_path, temp_split_path, std::filesystem::copy_options::overwrite_existing);
+                temp_splits[i] = temp_split_path;
+            }
+        } catch (const std::exception & e) {
+            LLAMA_LOG_ERROR("%s: failed to copy model to temporary directory: %s\n", __func__, e.what());
+            std::filesystem::remove_all(temp_dir);
+            llama_model_free(model);
+            return nullptr;
+        }
+    }
+
+    const int status = llama_model_load(temp_model_path, temp_splits, *model, params);
+
+    // Clean up temporary directory if created (on POSIX an mmap-ed file stays
+    // valid after unlink, so this is safe even with use_mmap enabled)
+    if (params.sync_to_temp && !temp_dir.empty()) {
+        try {
+            std::filesystem::remove_all(temp_dir);
+            LLAMA_LOG_INFO("%s: cleaned up temporary directory: %s\n", __func__, temp_dir.c_str());
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: failed to clean up temporary directory: %s\n", __func__, e.what());
+        }
+    }
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {