llama : improve sep token handling (#14272)

Sigbjørn Skjæret 2025-06-20 14:04:09 +02:00 committed by GitHub
parent e28c1b93fd
commit 88fc854b4b
15 changed files with 161 additions and 29 deletions


@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),


@@ -358,6 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
+    std::string cls_sep = "\t"; // separator of classification sequences
     // server params
     int32_t port = 8080; // server listens on this network port
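For completeness, the new field defaults to "\t", matching the --cls-separator default above. A caller that builds common_params directly instead of going through the argument parser could set it in code; the snippet below is a hypothetical usage sketch that assumes the repo's common.h is on the include path, not an excerpt from the commit.

#include "common.h"

int main() {
    common_params params;
    // equivalent to passing --cls-separator "<#seq#>" on the command line;
    // per the option's help text, this string separates classification sequences in the input
    params.cls_sep = "<#seq#>";
    return 0;
}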