feat(模型加载): 添加sync_to_temp选项支持临时目录加载

添加sync_to_temp参数控制是否将模型文件复制到临时目录后再加载
This commit is contained in:
2026-01-27 13:54:05 +08:00
parent 36a77d3318
commit a577d38f95
7 changed files with 94 additions and 1 deletions

29
Dockerfile Normal file
View File

@@ -0,0 +1,29 @@
# Build stage: compile llama-server / llama-cli against the Iluvatar CoreX toolkit.
FROM corex:4.3.8 AS builder
COPY . /app
WORKDIR /app
# The CoreX clang driver does not accept `-x cuda`; patch CMake's Clang-CUDA
# compiler module to pass `-x ivcore` instead, then configure and build only
# the two server/cli targets.
RUN sed -i 's/-x cuda/-x ivcore/g' /usr/local/share/cmake-3.25/Modules/Compiler/Clang-CUDA.cmake &&\
    cmake -S . -B build \
        -DGGML_CUDA=ON \
        -DLLAMA_CURL=OFF \
        -DCMAKE_CUDA_ARCHITECTURES=ivcore11 \
        -DCMAKE_CUDA_FLAGS="-x ivcore -std=c++17" \
        -DCMAKE_CUDA_STANDARD=17 \
        -DCMAKE_CXX_STANDARD=17 \
        -DGGML_CUDA_FA=OFF \
        -DBUILD_SHARED_LIBS=OFF \
        -DGGML_CUDA_FORCE_CUBLAS=ON \
        -DGGML_CPU=ON &&\
    cmake --build build --config Release -j \
        --target llama-server llama-cli
# Runtime stage: only the binaries plus the CoreX/OpenMPI runtime libraries.
# NOTE(review): consider pinning the base tag (e.g. ubuntu:22.04) for
# reproducible builds.
FROM ubuntu
WORKDIR /app
COPY --from=builder /usr/local/corex/ /usr/local/corex/
COPY --from=builder /usr/local/openmpi/lib/ /usr/local/openmpi/lib/
COPY --from=builder /app/build/bin/llama-server /app/llama-server
COPY --from=builder /app/build/bin/llama-cli /app/llama-cli
# apt-get (not apt) for scripted use; skip recommended packages and drop the
# package lists afterwards to keep the final image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libgomp1 && \
    rm -rf /var/lib/apt/lists/*
ENV PATH=/usr/local/corex/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/openmpi/lib:/usr/local/lib

View File

@@ -2039,6 +2039,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = value;
}
).set_env("LLAMA_ARG_MMAP"));
// --sync-to-temp / --no-sync-to-temp (env: LLAMA_ARG_SYNC_TO_TEMP):
// toggle copying the model file(s) into a temporary directory before loading;
// the handler simply stores the boolean into common_params::sync_to_temp.
add_opt(common_arg(
{"--sync-to-temp"},
{"--no-sync-to-temp"},
string_format("whether to copy model to temporary directory before loading (default: %s)", params.sync_to_temp ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.sync_to_temp = value;
}
).set_env("LLAMA_ARG_SYNC_TO_TEMP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"

View File

@@ -1353,6 +1353,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;
mparams.sync_to_temp = params.sync_to_temp;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;

View File

@@ -430,6 +430,7 @@ struct common_params {
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
bool no_host = false; // bypass host buffer allowing extra buffers to be used
bool sync_to_temp = false; // copy model to temporary directory before loading
bool single_turn = false; // single turn chat conversation

View File

@@ -314,6 +314,7 @@ extern "C" {
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
bool no_host; // bypass host buffer allowing extra buffers to be used
bool no_alloc; // only load metadata and simulate memory allocations
bool sync_to_temp; // copy model to temporary directory before loading
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

View File

@@ -7743,6 +7743,7 @@ llama_model_params llama_model_default_params() {
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
/*.no_alloc =*/ false,
/*.sync_to_temp =*/ false,
};
return result;

View File

@@ -19,9 +19,14 @@
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <stdexcept>
#include <system_error>
#include <unistd.h>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -940,7 +945,54 @@ static struct llama_model * llama_model_load_from_file_impl(
props.memory_free/1024/1024);
}
const int status = llama_model_load(path_model, splits, *model, params);
// Handle sync_to_temp option
std::string temp_model_path = path_model;
std::string temp_dir;
if (params.sync_to_temp) {
// Create temporary directory
char temp_dir_template[] = "/tmp/llama_model_XXXXXX";
char * temp_dir_ptr = mkdtemp(temp_dir_template);
if (!temp_dir_ptr) {
LLAMA_LOG_ERROR("%s: failed to create temporary directory: %s\n", __func__, strerror(errno));
llama_model_free(model);
return nullptr;
}
temp_dir = temp_dir_ptr;
// Copy model file to temporary directory
std::string model_filename = std::filesystem::path(path_model).filename();
temp_model_path = temp_dir + "/" + model_filename;
try {
LLAMA_LOG_INFO("%s: copying model to temporary directory: %s\n", __func__, temp_model_path.c_str());
std::filesystem::copy_file(path_model, temp_model_path, std::filesystem::copy_options::overwrite_existing);
// Also copy split files if any
for (size_t i = 1; i < splits.size(); ++i) {
const std::string & split_path = splits[i];
std::string split_filename = std::filesystem::path(split_path).filename();
std::string temp_split_path = temp_dir + "/" + split_filename;
std::filesystem::copy_file(split_path, temp_split_path, std::filesystem::copy_options::overwrite_existing);
}
} catch (const std::exception & e) {
LLAMA_LOG_ERROR("%s: failed to copy model to temporary directory: %s\n", __func__, e.what());
std::filesystem::remove_all(temp_dir);
llama_model_free(model);
return nullptr;
}
}
const int status = llama_model_load(temp_model_path, splits, *model, params);
// Clean up temporary directory if created
if (params.sync_to_temp && !temp_dir.empty()) {
try {
std::filesystem::remove_all(temp_dir);
LLAMA_LOG_INFO("%s: cleaned up temporary directory: %s\n", __func__, temp_dir.c_str());
} catch (const std::exception & e) {
LLAMA_LOG_WARN("%s: failed to clean up temporary directory: %s\n", __func__, e.what());
}
}
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {