llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa
* ggml-backend: abort instead of segfault
This commit is contained in:
@@ -18994,7 +18994,7 @@ llama_model_params llama_model_default_params() {
|
||||
llama_model_params result = {
|
||||
/*.devices =*/ nullptr,
|
||||
/*.tensor_buft_overrides =*/ nullptr,
|
||||
/*.n_gpu_layers =*/ 0,
|
||||
/*.n_gpu_layers =*/ 999,
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
@@ -19008,11 +19008,6 @@ llama_model_params llama_model_default_params() {
|
||||
/*.use_extra_bufts =*/ true,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
// note: we usually have plenty of VRAM, so by default offload all layers to the GPU
|
||||
result.n_gpu_layers = 999;
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user