llama : move end-user examples to tools directory (#13249)

* llama : move end-user examples to tools directory

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2025-05-02 20:27:13 +02:00 · 2025-05-02 20:27:13 +02:00 · 1d36b3670b
commit 1d36b3670b
parent b34443923c
213 changed files with 226 additions and 190 deletions
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -1,77 +0,0 @@
-import pytest
-from utils import *
-
-server = ServerPreset.tinyllama_infill()
-
-@pytest.fixture(scope="module", autouse=True)
-def create_server():
-    global server
-    server = ServerPreset.tinyllama_infill()
-
-
-def test_infill_without_input_extra():
-    global server
-    server.start()
-    res = server.make_request("POST", "/infill", data={
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
-        "input_suffix": "}\n",
-    })
-    assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
-
-
-def test_infill_with_input_extra():
-    global server
-    server.start()
-    res = server.make_request("POST", "/infill", data={
-        "input_extra": [{
-            "filename": "llama.h",
-            "text": "LLAMA_API int32_t llama_n_threads();\n"
-        }],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
-        "input_suffix": "}\n",
-    })
-    assert res.status_code == 200
-    assert match_regex("(Dad|excited|park)+", res.body["content"])
-
-
-@pytest.mark.parametrize("input_extra", [
-    {},
-    {"filename": "ok"},
-    {"filename": 123},
-    {"filename": 123, "text": "abc"},
-    {"filename": 123, "text": 456},
-])
-def test_invalid_input_extra_req(input_extra):
-    global server
-    server.start()
-    res = server.make_request("POST", "/infill", data={
-        "input_extra": [input_extra],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
-        "input_suffix": "}\n",
-    })
-    assert res.status_code == 400
-    assert "error" in res.body
-
-
-@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
-def test_with_qwen_model():
-    global server
-    server.model_file = None
-    server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
-    server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
-    server.start(timeout_seconds=600)
-    res = server.make_request("POST", "/infill", data={
-        "input_extra": [{
-            "filename": "llama.h",
-            "text": "LLAMA_API int32_t llama_n_threads();\n"
-        }],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
-        "input_suffix": "}\n",
-    })
-    assert res.status_code == 200
-    assert res.body["content"] == "n_threads();\n    printf(\"Number of threads: %d\\n\", n_threads);\n    return 0;\n"