CUDA: app option to compile without FlashAttention (#12025)
This commit is contained in:
parent
36c258ee92
commit
a28e0d5eb1
13 changed files with 46 additions and 31 deletions
|
|
@ -44,10 +44,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
|||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#ifndef FLASH_ATTN_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
|
|
@ -285,6 +282,9 @@ static __global__ void flash_attn_tile_ext_f32(
|
|||
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue