metal : optimize multi-sequence FA vec kernel (#13493)
* batched-bench : fix pp batch contents
* metal : optimize multi-sequence FA vec kernel

ggml-ci
This commit is contained in:
parent
4f711afed5
commit
c252e0c409
1 changed file with 5 additions and 0 deletions
|
@@ -3887,6 +3887,11 @@ kernel void kernel_flash_attn_ext_vec(
|
||||||
sm[tiisg] = pm[ic + tiisg];
|
sm[tiisg] = pm[ic + tiisg];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// skip -INF blocks
|
||||||
|
if (simd_max(sm[tiisg]) == -INFINITY) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Q*K^T
|
// Q*K^T
|
||||||
{
|
{
|
||||||
// each simdgroup processes 1 query and NE (NW/NL) head elements
|
// each simdgroup processes 1 query and NE (NW/NL) head elements
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue