feat(模型加载): 添加sync_to_temp选项支持临时目录加载

添加sync_to_temp参数控制是否将模型文件复制到临时目录后再加载
This commit is contained in:
2026-01-27 13:54:05 +08:00
parent 36a77d3318
commit a577d38f95
7 changed files with 94 additions and 1 deletions

29
Dockerfile Normal file
View File

@@ -0,0 +1,29 @@
# Build stage: compile llama-server / llama-cli against the Iluvatar CoreX toolkit.
FROM corex:4.3.8 AS builder
COPY . /app
WORKDIR /app
# The CoreX clang driver does not accept `-x cuda`; patch CMake's Clang-CUDA
# compiler module to pass `-x ivcore` instead, then configure and build only
# the two server/cli targets.
RUN sed -i 's/-x cuda/-x ivcore/g' /usr/local/share/cmake-3.25/Modules/Compiler/Clang-CUDA.cmake &&\
    cmake -S . -B build \
        -DGGML_CUDA=ON \
        -DLLAMA_CURL=OFF \
        -DCMAKE_CUDA_ARCHITECTURES=ivcore11 \
        -DCMAKE_CUDA_FLAGS="-x ivcore -std=c++17" \
        -DCMAKE_CUDA_STANDARD=17 \
        -DCMAKE_CXX_STANDARD=17 \
        -DGGML_CUDA_FA=OFF \
        -DBUILD_SHARED_LIBS=OFF \
        -DGGML_CUDA_FORCE_CUBLAS=ON \
        -DGGML_CPU=ON &&\
    cmake --build build --config Release -j \
        --target llama-server llama-cli
# Runtime stage: only the binaries plus the CoreX/OpenMPI runtime libraries.
# NOTE(review): consider pinning the base tag (e.g. ubuntu:22.04) for
# reproducible builds.
FROM ubuntu
WORKDIR /app
COPY --from=builder /usr/local/corex/ /usr/local/corex/
COPY --from=builder /usr/local/openmpi/lib/ /usr/local/openmpi/lib/
COPY --from=builder /app/build/bin/llama-server /app/llama-server
COPY --from=builder /app/build/bin/llama-cli /app/llama-cli
# apt-get (not apt) for scripted use; skip recommended packages and drop the
# package lists afterwards to keep the final image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libgomp1 && \
    rm -rf /var/lib/apt/lists/*
ENV PATH=/usr/local/corex/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/openmpi/lib:/usr/local/lib

View File

@@ -2039,6 +2039,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = value;
}
).set_env("LLAMA_ARG_MMAP"));
// --sync-to-temp / --no-sync-to-temp (env: LLAMA_ARG_SYNC_TO_TEMP):
// toggle copying the model file(s) into a temporary directory before loading;
// the handler simply stores the boolean into common_params::sync_to_temp.
add_opt(common_arg(
{"--sync-to-temp"},
{"--no-sync-to-temp"},
string_format("whether to copy model to temporary directory before loading (default: %s)", params.sync_to_temp ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.sync_to_temp = value;
}
).set_env("LLAMA_ARG_SYNC_TO_TEMP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"

View File

@@ -1353,6 +1353,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;
mparams.sync_to_temp = params.sync_to_temp;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;

View File

@@ -430,6 +430,7 @@ struct common_params {
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
bool no_host = false; // bypass host buffer allowing extra buffers to be used
bool sync_to_temp = false; // copy model to temporary directory before loading
bool single_turn = false; // single turn chat conversation

View File

@@ -314,6 +314,7 @@ extern "C" {
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
bool no_host; // bypass host buffer allowing extra buffers to be used
bool no_alloc; // only load metadata and simulate memory allocations
bool sync_to_temp; // copy model to temporary directory before loading
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

View File

@@ -7743,6 +7743,7 @@ llama_model_params llama_model_default_params() {
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
/*.no_alloc =*/ false,
/*.sync_to_temp =*/ false,
};
return result;

View File

@@ -19,9 +19,14 @@
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <stdexcept>
#include <system_error>
#include <unistd.h>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -940,7 +945,54 @@ static struct llama_model * llama_model_load_from_file_impl(
props.memory_free/1024/1024);
}
const int status = llama_model_load(path_model, splits, *model, params);
// Handle sync_to_temp option
std::string temp_model_path = path_model;
std::string temp_dir;
if (params.sync_to_temp) {
// Create temporary directory
char temp_dir_template[] = "/tmp/llama_model_XXXXXX";
char * temp_dir_ptr = mkdtemp(temp_dir_template);
if (!temp_dir_ptr) {
LLAMA_LOG_ERROR("%s: failed to create temporary directory: %s\n", __func__, strerror(errno));
llama_model_free(model);
return nullptr;
}
temp_dir = temp_dir_ptr;
// Copy model file to temporary directory
std::string model_filename = std::filesystem::path(path_model).filename();
temp_model_path = temp_dir + "/" + model_filename;
try {
LLAMA_LOG_INFO("%s: copying model to temporary directory: %s\n", __func__, temp_model_path.c_str());
std::filesystem::copy_file(path_model, temp_model_path, std::filesystem::copy_options::overwrite_existing);
// Also copy split files if any
for (size_t i = 1; i < splits.size(); ++i) {
const std::string & split_path = splits[i];
std::string split_filename = std::filesystem::path(split_path).filename();
std::string temp_split_path = temp_dir + "/" + split_filename;
std::filesystem::copy_file(split_path, temp_split_path, std::filesystem::copy_options::overwrite_existing);
}
} catch (const std::exception & e) {
LLAMA_LOG_ERROR("%s: failed to copy model to temporary directory: %s\n", __func__, e.what());
std::filesystem::remove_all(temp_dir);
llama_model_free(model);
return nullptr;
}
}
const int status = llama_model_load(temp_model_path, splits, *model, params);
// Clean up temporary directory if created
if (params.sync_to_temp && !temp_dir.empty()) {
try {
std::filesystem::remove_all(temp_dir);
LLAMA_LOG_INFO("%s: cleaned up temporary directory: %s\n", __func__, temp_dir.c_str());
} catch (const std::exception & e) {
LLAMA_LOG_WARN("%s: failed to clean up temporary directory: %s\n", __func__, e.what());
}
}
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {