llama : add support for control vectors (#5970)

* control vector api and implementation

* control-vectors : minor code style updates

* disable control vector when data == nullptr

  use -1 for disabled range (also on init) in case we ever support controlling layer 0 (embeddings)

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-15 13:43:02 -07:00 · 2024-03-15 13:43:02 -07:00 · 877b4d0c62
commit 877b4d0c62
parent 12247f4c69
4 changed files with 392 additions and 5 deletions
--- a/llama.h
+++ b/llama.h
@@ -388,6 +388,7 @@ extern "C" {
    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +436,24 @@ extern "C" {
    // Returns 0 on success
    LLAMA_API int32_t llama_model_apply_lora_from_file(
            const struct llama_model * model,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads);
+                          const char * path_lora,
+                               float   scale,
+                          const char * path_base_model,
+                             int32_t   n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control vector, and data should
+    // point to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                          size_t   len,
+                         int32_t   n_embd,
+                         int32_t   il_start,
+                         int32_t   il_end);

    //
    // KV cache