From a577d38f9502eddc252baf8174701076e2bab864 Mon Sep 17 00:00:00 2001
From: xiezhongtao
Date: Tue, 27 Jan 2026 13:54:05 +0800
Subject: [PATCH] =?UTF-8?q?feat(=E6=A8=A1=E5=9E=8B=E5=8A=A0=E8=BD=BD):=20?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0sync=5Fto=5Ftemp=E9=80=89=E9=A1=B9=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E4=B8=B4=E6=97=B6=E7=9B=AE=E5=BD=95=E5=8A=A0=E8=BD=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加sync_to_temp参数控制是否将模型文件复制到临时目录后再加载
---
 Dockerfile          | 29 ++++++++++++++++++++++
 common/arg.cpp      |  8 ++++++
 common/common.cpp   |  1 +
 common/common.h     |  1 +
 include/llama.h     |  1 +
 src/llama-model.cpp |  1 +
 src/llama.cpp       | 61 ++++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..eb4a313
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+FROM corex:4.3.8 as builder
+ADD . /app
+WORKDIR /app
+RUN sed -i 's/-x cuda/-x ivcore/g' /usr/local/share/cmake-3.25/Modules/Compiler/Clang-CUDA.cmake &&\
+    cmake -S . \
+        -B build \
+        -DGGML_CUDA=ON \
+        -DLLAMA_CURL=OFF \
+        -DCMAKE_CUDA_ARCHITECTURES=ivcore11 \
+        -DCMAKE_CUDA_FLAGS="-x ivcore -std=c++17" \
+        -DCMAKE_CUDA_STANDARD=17 \
+        -DCMAKE_CXX_STANDARD=17 \
+        -DGGML_CUDA_FA=OFF \
+        -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_CUDA_FORCE_CUBLAS=ON \
+        -DGGML_CPU=ON &&\
+    cmake --build build --config Release -j \
+        --target llama-server llama-cli


+FROM ubuntu
+WORKDIR /app
+COPY --from=builder /usr/local/corex/ /usr/local/corex/
+COPY --from=builder /usr/local/openmpi/lib/ /usr/local/openmpi/lib/
+COPY --from=builder /app/build/bin/llama-server /app/llama-server
+COPY --from=builder /app/build/bin/llama-cli /app/llama-cli
+RUN apt update && apt install -y libgomp1
+ENV PATH=/usr/local/corex/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/openmpi/lib:/usr/local/lib
diff --git a/common/arg.cpp b/common/arg.cpp
index 1302065..7ae3aa5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2039,6 +2039,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"--sync-to-temp"},
+        {"--no-sync-to-temp"},
+        string_format("whether to copy model to temporary directory before loading (default: %s)", params.sync_to_temp ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.sync_to_temp = value;
+        }
+    ).set_env("LLAMA_ARG_SYNC_TO_TEMP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
diff --git a/common/common.cpp b/common/common.cpp
index acf2ec8..d8f333f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1353,6 +1353,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host = params.no_host;
+    mparams.sync_to_temp = params.sync_to_temp;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 3343720..0512841 100644
--- a/common/common.h
+++ b/common/common.h
@@ -430,6 +430,7 @@ struct common_params {
     bool no_op_offload = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
    bool no_host = false; // bypass host buffer allowing extra buffers to be used
+    bool sync_to_temp = false; // copy model to temporary directory before loading

    bool single_turn = false; // single turn chat conversation

diff --git a/include/llama.h b/include/llama.h
index f862930..259b95d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -314,6 +314,7 @@ extern "C" {
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
         bool no_alloc;        // only load metadata and simulate memory allocations
+        bool sync_to_temp;    // copy model to temporary directory before loading
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0d5bcc6..b9bf42b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7743,6 +7743,7 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts  =*/ true,
         /*.no_host          =*/ false,
         /*.no_alloc         =*/ false,
+        /*.sync_to_temp     =*/ false,
     };

     return result;
diff --git a/src/llama.cpp b/src/llama.cpp
index 1e18637..704acfc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19,9 +19,14 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <cerrno>
 #include <cstdio>
 #include <cstring>
+#include <cstdlib>
+#include <filesystem>
 #include <ctime>
+#include <unistd.h>
+

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -940,7 +945,61 @@ static struct llama_model * llama_model_load_from_file_impl(
                 props.memory_free/1024/1024);
     }

-    const int status = llama_model_load(path_model, splits, *model, params);
+    // sync_to_temp: stage the model file(s) in a fresh temporary directory and
+    // load from there (e.g. so the load reads from fast local storage)
+    std::string temp_model_path = path_model;
+    std::vector<std::string> temp_splits = splits;
+    std::string temp_dir;
+    if (params.sync_to_temp) {
+        // Create temporary directory (mkdtemp is POSIX-only)
+        char temp_dir_template[] = "/tmp/llama_model_XXXXXX";
+        char * temp_dir_ptr = mkdtemp(temp_dir_template);
+        if (!temp_dir_ptr) {
+            LLAMA_LOG_ERROR("%s: failed to create temporary directory: %s\n", __func__, strerror(errno));
+            llama_model_free(model);
+            return nullptr;
+        }
+        temp_dir = temp_dir_ptr;
+
+        // Copy model file to temporary directory
+        std::string model_filename = std::filesystem::path(path_model).filename().string();
+        temp_model_path = temp_dir + "/" + model_filename;
+
+        try {
+            LLAMA_LOG_INFO("%s: copying model to temporary directory: %s\n", __func__, temp_model_path.c_str());
+            std::filesystem::copy_file(path_model, temp_model_path, std::filesystem::copy_options::overwrite_existing);
+
+            // Also copy split files if any, and point the loader at the copies
+            if (!temp_splits.empty()) {
+                temp_splits[0] = temp_model_path;
+            }
+            for (size_t i = 1; i < splits.size(); ++i) {
+                const std::string & split_path = splits[i];
+                std::string split_filename = std::filesystem::path(split_path).filename().string();
+                std::string temp_split_path = temp_dir + "/" + split_filename;
+                std::filesystem::copy_file(split_path, temp_split_path, std::filesystem::copy_options::overwrite_existing);
+                temp_splits[i] = temp_split_path;
+            }
+        } catch (const std::exception & e) {
+            LLAMA_LOG_ERROR("%s: failed to copy model to temporary directory: %s\n", __func__, e.what());
+            std::filesystem::remove_all(temp_dir);
+            llama_model_free(model);
+            return nullptr;
+        }
+    }
+
+    const int status = llama_model_load(temp_model_path, temp_splits, *model, params);
+
+    // Clean up temporary directory if created (on POSIX an mmap-ed file stays
+    // valid after unlink, so this is safe even with use_mmap enabled)
+    if (params.sync_to_temp && !temp_dir.empty()) {
+        try {
+            std::filesystem::remove_all(temp_dir);
+            LLAMA_LOG_INFO("%s: cleaned up temporary directory: %s\n", __func__, temp_dir.c_str());
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: failed to clean up temporary directory: %s\n", __func__, e.what());
+        }
+    }
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {