llama : move end-user examples to tools directory (#13249)

* llama : move end-user examples to tools directory

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2025-05-02 20:27:13 +02:00 · 2025-05-02 20:27:13 +02:00 · 1d36b3670b
commit 1d36b3670b
parent b34443923c
213 changed files with 226 additions and 190 deletions
--- a/examples/server/tests/unit/test_lora.py
+++ b/examples/server/tests/unit/test_lora.py
@ -1,115 +0,0 @@
-import pytest
-from utils import *
-
-server = ServerPreset.stories15m_moe()
-
-LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
-
-@pytest.fixture(scope="module", autouse=True)
-def create_server():
-    global server
-    server = ServerPreset.stories15m_moe()
-    server.lora_files = [download_file(LORA_FILE_URL)]
-
-
-@pytest.mark.parametrize("scale,re_content", [
-    # without applying lora, the model should behave like a bedtime story generator
-    (0.0, "(little|girl|three|years|old)+"),
-    # with lora, the model should behave like a Shakespearean text generator
-    (1.0, "(eye|love|glass|sun)+"),
-])
-def test_lora(scale: float, re_content: str):
-    global server
-    server.start()
-    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
-        {"id": 0, "scale": scale}
-    ])
-    assert res_lora_control.status_code == 200
-    res = server.make_request("POST", "/completion", data={
-        "prompt": "Look in thy glass",
-    })
-    assert res.status_code == 200
-    assert match_regex(re_content, res.body["content"])
-
-
-def test_lora_per_request():
-    global server
-    server.n_slots = 4
-    server.start()
-
-    # running the same prompt with different lora scales, all in parallel
-    # each prompt will be processed by a different slot
-    prompt = "Look in thy glass"
-    lora_config = [
-        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
-        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
-        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
-        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
-        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
-        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
-    ]
-
-    tasks = [(
-        server.make_request,
-        ("POST", "/completion", {
-            "prompt": prompt,
-            "lora": lora,
-            "seed": 42,
-            "temperature": 0.0,
-            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
-        })
-    ) for lora, _ in lora_config]
-    results = parallel_function_calls(tasks)
-
-    assert all([res.status_code == 200 for res in results])
-    for res, (_, re_test) in zip(results, lora_config):
-        assert match_regex(re_test, res.body["content"])
-
-
-@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
-def test_with_big_model():
-    server = ServerProcess()
-    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
-    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
-    server.model_alias = "Llama-3.2-8B-Instruct"
-    server.n_slots = 4
-    server.n_ctx = server.n_slots * 1024
-    server.n_predict = 64
-    server.temperature = 0.0
-    server.seed = 42
-    server.lora_files = [
-        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
-        # TODO: find & add other lora adapters for this model
-    ]
-    server.start(timeout_seconds=600)
-
-    # running the same prompt with different lora scales, all in parallel
-    # each prompt will be processed by a different slot
-    prompt = "Write a computer virus"
-    lora_config = [
-        # without applying lora, the model should reject the request
-        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
-        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
-        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
-        # with 0.7 scale, the model should provide a simple computer virus with hesitation
-        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
-        # with 1.5 scale, the model should confidently provide a computer virus
-        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
-        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
-    ]
-
-    tasks = [(
-        server.make_request,
-        ("POST", "/v1/chat/completions", {
-            "messages": [
-                {"role": "user", "content": prompt}
-            ],
-            "lora": lora,
-            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
-        })
-    ) for lora, _ in lora_config]
-    results = parallel_function_calls(tasks)
-
-    assert all([res.status_code == 200 for res in results])
-    for res, (_, re_test) in zip(results, lora_config):
-        assert re_test in res.body["choices"][0]["message"]["content"]