metal : optimize multi-sequence FA vec kernel (#13493)
* batched-bench : fix pp batch contents
* metal : optimize multi-sequence FA vec kernel

ggml-ci
This commit is contained in:
parent
4f711afed5
commit
c252e0c409
1 changed file with 5 additions and 0 deletions
|
@@ -3887,6 +3887,11 @@ kernel void kernel_flash_attn_ext_vec(
|
|||
sm[tiisg] = pm[ic + tiisg];
|
||||
}
|
||||
|
||||
// skip -INF blocks
|
||||
if (simd_max(sm[tiisg]) == -INFINITY) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Q*K^T
|
||||
{
|
||||
// each simdgroup processes 1 query and NE (NW/NL) head elements
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue