llama : fix FA when KV cache is not used (i.e. embeddings) (#12825)

* ggml : FA supports F32 V

* graph : cast KV to F16 when the KV cache is not used

ggml-ci

* server : add test that exercises embeddings with FA enabled

ggml-ci
Georgi Gerganov 2025-04-08 19:54:51 +03:00 committed by GitHub
parent 78a1ba0a4f
commit a19b5cef16
6 changed files with 59 additions and 6 deletions


@@ -49,6 +49,26 @@ def test_embedding_multiple():
        assert len(d['embedding']) > 1


def test_embedding_multiple_with_fa():
    server = ServerPreset.bert_bge_small_with_fa()
    server.pooling = 'last'
    server.start()
    # one of these should trigger the FA branch (i.e. context size % 256 == 0)
    res = server.make_request("POST", "/v1/embeddings", data={
        "input": [
            "a "*253,
            "b "*254,
            "c "*255,
            "d "*256,
        ],
    })
    assert res.status_code == 200
    assert len(res.body['data']) == 4
    for d in res.body['data']:
        assert 'embedding' in d
        assert len(d['embedding']) > 1


@pytest.mark.parametrize(
    "input,is_multi_prompt",
    [
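The comment in the new test relies on one of the prompt lengths landing exactly on a 256-token boundary. A back-of-the-envelope sketch of that arithmetic, assuming the BGE tokenizer emits one token per repeated word plus two special tokens (this tokenization detail is an assumption, not stated in the diff):

SPECIAL_TOKENS = 2  # assumed [CLS] + [SEP]
for reps in (253, 254, 255, 256):
    n_tokens = reps + SPECIAL_TOKENS
    print(reps, n_tokens, n_tokens % 256 == 0)
# under these assumptions only reps == 254 yields n_tokens % 256 == 0 (256 tokens)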


@@ -323,6 +323,21 @@ class ServerPreset:
        server.server_embeddings = True
        return server

    @staticmethod
    def bert_bge_small_with_fa() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
        server.model_alias = "bert-bge-small"
        server.n_ctx = 1024
        server.n_batch = 300
        server.n_ubatch = 300
        server.n_slots = 2
        server.fa = True
        server.seed = 42
        server.server_embeddings = True
        return server

    @staticmethod
    def tinyllama_infill() -> ServerProcess:
        server = ServerProcess()
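For orientation, an illustrative sketch of how fields of a preset like bert_bge_small_with_fa() could translate into llama-server command-line flags. The flag names are real llama-server options; the mapping function itself is an assumption, not the actual ServerProcess implementation from the test utilities:

def build_args(server) -> list[str]:
    # hypothetical mapping for illustration only
    args = [
        "llama-server",
        "--hf-repo",     server.model_hf_repo,
        "--hf-file",     server.model_hf_file,
        "--alias",       server.model_alias,
        "--ctx-size",    str(server.n_ctx),
        "--batch-size",  str(server.n_batch),
        "--ubatch-size", str(server.n_ubatch),
        "--parallel",    str(server.n_slots),
        "--seed",        str(server.seed),
    ]
    if server.fa:
        args.append("--flash-attn")   # the path this commit's test exercises
    if server.server_embeddings:
        args.append("--embeddings")
    return args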


@@ -15,7 +15,7 @@ async def main():
    model_url = "http://127.0.0.1:6900"
    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
        url= f"{model_url}/embedding",
-       json= {"content": str(0)*1024}
+       json= {"content": "a "*1022}
    ) for i in range(n)])
    for response in responses:
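The helper requests_post_async is defined elsewhere in this script and is not part of the hunk. A minimal sketch of what such a helper plausibly looks like, assuming it simply offloads the blocking requests.post call to a worker thread so the requests can be gathered concurrently (an assumption, not the script's actual definition):

import asyncio
import requests

async def requests_post_async(*args, **kwargs) -> requests.Response:
    # assumed helper: run the blocking requests.post in a thread so that many
    # embedding requests can be issued concurrently via asyncio.gather
    return await asyncio.to_thread(requests.post, *args, **kwargs)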