vulkan: Implement split_k for coopmat2 flash attention. (#12627)
When using group query attention, we have one workgroup per KV batch and this can be very few workgroups (e.g. just 8 in some models). Enable split_k to spread the work across SMs. This helps a lot when the KV cache is large.
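Conceptually, split_k divides the KV sequence handled by each workgroup into several chunks that run concurrently on different SMs, and a small reduction pass then merges the per-chunk partial results. The sketch below is plain C++, not the Vulkan shader or host code from this commit; the SplitPartial struct and combine_splits function are illustrative names only. It shows how per-split online-softmax partials (a running max, a softmax denominator, and an unnormalized output accumulator) can be merged without changing the result:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Partial results produced by one KV chunk for a single query row
// (illustrative layout, not the commit's actual data layout).
struct SplitPartial {
    float m;              // max logit seen in this chunk
    float l;              // sum of exp(logit - m) over this chunk
    std::vector<float> O; // unnormalized output accumulator, length = head size
};

// Merge the per-split partials into the final attention output row.
static std::vector<float> combine_splits(const std::vector<SplitPartial> & parts) {
    // A global max across splits keeps the rescaling numerically stable.
    float m_total = -INFINITY;
    for (const auto & p : parts) {
        m_total = std::max(m_total, p.m);
    }

    const size_t D = parts[0].O.size();
    std::vector<float> out(D, 0.0f);
    float l_total = 0.0f;

    for (const auto & p : parts) {
        // Rescale each split's denominator and accumulator to the global max.
        const float scale = std::exp(p.m - m_total);
        l_total += p.l * scale;
        for (size_t d = 0; d < D; ++d) {
            out[d] += p.O[d] * scale;
        }
    }
    for (size_t d = 0; d < D; ++d) {
        out[d] /= l_total; // final softmax normalization
    }
    return out;
}

The rescale-by-exp(m - m_total) step is the same trick online softmax already uses within a chunk, which is why splitting the KV dimension across workgroups leaves the output unchanged while exposing more parallelism.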
parent 6f3bd38640
commit f01bd02376
5 changed files with 177 additions and 17 deletions
@@ -4516,6 +4516,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         }
     }
 
+    for (int kv : { 4096, 8192, 16384, }) {
+        for (int hs : { 64, 128, }) {
+            test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
+        }
+    }
+
     return test_cases;
 }

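The added perf cases sweep KV lengths of 4096, 8192, and 16384 at head sizes 64 and 128, covering the large-KV decode scenario described above. They should be runnable with the perf mode of the backend test binary, e.g. test-backend-ops perf -o FLASH_ATTN_EXT (flag names recalled from the test harness rather than taken from this diff, so check its usage output).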