llama : add option to override model tensor buffers (#11397)

* llama : add option to override tensor buffers

* ggml : fix possible underflow in ggml_nbytes
This commit is contained in:
Diego Devesa 2025-04-02 14:52:01 +02:00 committed by GitHub
parent a10b36c91a
commit e0e912f49b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 108 additions and 9 deletions

View file

@ -280,10 +280,18 @@ extern "C" {
};
};
struct llama_model_tensor_buft_override {
const char * pattern;
ggml_backend_buffer_type_t buft;
};
struct llama_model_params {
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
ggml_backend_dev_t * devices;
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs