Sync with upstream ggml-org/llama.cpp tag b7751

This commit is contained in:
2026-01-16 14:21:48 +08:00
parent 8f29820a88
commit a2efec7ed6
2053 changed files with 956034 additions and 2 deletions

8
tools/rpc/CMakeLists.txt Normal file
View File

@@ -0,0 +1,8 @@
set(TARGET rpc-server)
add_executable(${TARGET} rpc-server.cpp)
target_link_libraries(${TARGET} PRIVATE ggml)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

104
tools/rpc/README.md Normal file
View File

@@ -0,0 +1,104 @@
## Overview
> [!IMPORTANT]
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
The `rpc-server` allows exposing `ggml` devices on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
flowchart TD
rpcb<-->|TCP|srva
rpcb<-->|TCP|srvb
rpcb<-.->|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]<-.->dev4["CUDA0"]
srvn[rpc-server]<-.->dev5["CPU"]
end
subgraph hostb[Host B]
srvb[rpc-server]<-->dev3["Metal"]
end
subgraph hosta[Host A]
srva[rpc-server]<-->dev["CUDA0"]
srva[rpc-server]<-->dev2["CUDA1"]
end
subgraph host[Main Host]
local["Local devices"]<-->ggml[llama-cli]
ggml[llama-cli]<-->rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
classDef devcls fill:#5B9BD5
class local,dev,dev2,dev3,dev4,dev5 devcls
```
By default, `rpc-server` exposes all available accelerator devices on the host.
If there are no accelerators, it exposes a single `CPU` device.
## Usage
### Remote hosts
On each remote host, build the backends for each accelerator by adding `-DGGML_RPC=ON` to the build options.
For example, to build the `rpc-server` with support for CUDA accelerators:
```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build . --config Release
```
When started, the `rpc-server` will detect and expose all available `CUDA` devices:
```bash
$ bin/rpc-server
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA GeForce RTX 5090, compute capability 12.0, VMM: yes
Starting RPC server v3.0.0
endpoint : 127.0.0.1:50052
local cache : n/a
Devices:
CUDA0: NVIDIA GeForce RTX 5090 (32109 MiB, 31588 MiB free)
```
You can control the set of exposed CUDA devices with the `CUDA_VISIBLE_DEVICES` environment variable or the `--device` command line option. The following two commands have the same effect:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
$ bin/rpc-server --device CUDA0 -p 50052
```
### Main host
On the main host build `llama.cpp` with the backends for the local devices and add `-DGGML_RPC=ON` to the build options.
Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF -ngl 99 --rpc 192.168.88.10:50052,192.168.88.11:50052
```
By default, llama.cpp distributes model weights and the KV cache across all available devices -- both local and remote -- in proportion to each device's available memory.
You can override this behavior with the `--tensor-split` option and set custom proportions when splitting tensor data across devices.
### Local cache
The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
This can speed up model loading significantly, especially when using large models.
To enable the cache, use the `-c` option:
```bash
$ bin/rpc-server -c
```
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
### Troubleshooting
Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `rpc-server`:
```bash
$ GGML_RPC_DEBUG=1 bin/rpc-server
```

302
tools/rpc/rpc-server.cpp Normal file
View File

@@ -0,0 +1,302 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml-rpc.h"
#ifdef _WIN32
# define NOMINMAX
# define DIRECTORY_SEPARATOR '\\'
# include <locale>
# include <windows.h>
# include <fcntl.h>
# include <io.h>
#else
# define DIRECTORY_SEPARATOR '/'
# include <unistd.h>
# include <sys/stat.h>
#endif
#include <codecvt>
#include <string>
#include <stdio.h>
#include <vector>
#include <filesystem>
#include <algorithm>
#include <thread>
#include <regex>
namespace fs = std::filesystem;
// NOTE: this is copied from common.cpp to avoid linking with libcommon
// returns true if successful, false otherwise
static bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return true;
}
size_t pos_slash = 0;
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
// if the path already exists, ensure that it's a directory
if (error == ERROR_ALREADY_EXISTS) {
const DWORD attributes = GetFileAttributesW(subpath.c_str());
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return false;
}
} else {
return false;
}
}
pos_slash += 1;
}
return true;
#else
// if the path already exists, check whether it's a directory
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
}
size_t pos_slash = 1; // skip leading slashes for directory creation
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
const std::string subpath = path.substr(0, pos_slash);
struct stat info;
// if the path already exists, ensure that it's a directory
if (stat(subpath.c_str(), &info) == 0) {
if (!S_ISDIR(info.st_mode)) {
return false;
}
} else {
// create parent directories
const int ret = mkdir(subpath.c_str(), 0755);
if (ret != 0) {
return false;
}
}
pos_slash += 1;
}
return true;
#endif // _WIN32
}
// NOTE: this is copied from common.cpp to avoid linking with libcommon
static std::string fs_get_cache_directory() {
std::string cache_directory = "";
auto ensure_trailing_slash = [](std::string p) {
// Make sure to add trailing slash
if (p.back() != DIRECTORY_SEPARATOR) {
p += DIRECTORY_SEPARATOR;
}
return p;
};
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#else
# error Unknown architecture
#endif
cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
}
return ensure_trailing_slash(cache_directory);
}
struct rpc_server_params {
std::string host = "127.0.0.1";
int port = 50052;
bool use_cache = false;
int n_threads = std::max(1U, std::thread::hardware_concurrency()/2);
std::vector<std::string> devices;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t, --threads N number of threads for the CPU device (default: %d)\n", params.n_threads);
fprintf(stderr, " -d, --device <dev1,dev2,...> comma-separated list of devices\n");
fprintf(stderr, " -H, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -c, --cache enable local file cache\n");
fprintf(stderr, "\n");
}
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-H" || arg == "--host") {
if (++i >= argc) {
return false;
}
params.host = argv[i];
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
return false;
}
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
fprintf(stderr, "error: invalid number of threads: %d\n", params.n_threads);
return false;
}
} else if (arg == "-d" || arg == "--device") {
if (++i >= argc) {
return false;
}
const std::regex regex{ R"([,/]+)" };
std::string dev_str = argv[i];
std::sregex_token_iterator iter(dev_str.begin(), dev_str.end(), regex, -1);
std::sregex_token_iterator end;
for ( ; iter != end; ++iter) {
try {
params.devices.push_back(*iter);
} catch (const std::exception & ) {
fprintf(stderr, "error: invalid device: %s\n", iter->str().c_str());
return false;
}
}
} else if (arg == "-p" || arg == "--port") {
if (++i >= argc) {
return false;
}
params.port = std::stoi(argv[i]);
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-c" || arg == "--cache") {
params.use_cache = true;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
static std::vector<ggml_backend_dev_t> get_devices(const rpc_server_params & params) {
std::vector<ggml_backend_dev_t> devices;
if (!params.devices.empty()) {
for (auto device : params.devices) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(device.c_str());
if (dev) {
devices.push_back(dev);
} else {
fprintf(stderr, "error: unknown device: %s\n", device.c_str());
fprintf(stderr, "available devices:\n");
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
auto * dev = ggml_backend_dev_get(i);
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
}
return {};
}
}
}
// Try non-CPU devices first
if (devices.empty()) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
devices.push_back(dev);
}
}
}
// If there are no accelerators, fallback to CPU device
if (devices.empty()) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (dev) {
devices.push_back(dev);
}
}
return devices;
}
int main(int argc, char * argv[]) {
ggml_backend_load_all();
rpc_server_params params;
if (!rpc_server_params_parse(argc, argv, params)) {
fprintf(stderr, "Invalid parameters\n");
return 1;
}
if (params.host != "127.0.0.1") {
fprintf(stderr, "\n");
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
fprintf(stderr, " Never expose the RPC server to an open network!\n");
fprintf(stderr, " This is an experimental feature and is not secure!\n");
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
fprintf(stderr, "\n");
}
auto devices = get_devices(params);
if (devices.empty()) {
fprintf(stderr, "No devices found\n");
return 1;
}
std::string endpoint = params.host + ":" + std::to_string(params.port);
const char * cache_dir = nullptr;
std::string cache_dir_str;
if (params.use_cache) {
cache_dir_str = fs_get_cache_directory() + "rpc/";
if (!fs_create_directory_with_parents(cache_dir_str)) {
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
return 1;
}
cache_dir = cache_dir_str.c_str();
}
ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
if (!reg) {
fprintf(stderr, "Failed to find RPC backend\n");
return 1;
}
auto start_server_fn = (decltype(ggml_backend_rpc_start_server)*) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_start_server");
if (!start_server_fn) {
fprintf(stderr, "Failed to obtain RPC backend start server function\n");
return 1;
}
start_server_fn(endpoint.c_str(), cache_dir, params.n_threads, devices.size(), devices.data());
return 0;
}