CUDA: add option to compile without FlashAttention (#12025)

This commit is contained in:
Johannes Gäßler 2025-02-22 20:44:34 +01:00 committed by GitHub
parent 36c258ee92
commit a28e0d5eb1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 46 additions and 31 deletions

View file

@@ -44,10 +44,7 @@ static __global__ void flash_attn_tile_ext_f32(
const int ne1,
const int ne2,
const int ne3) {
#ifndef FLASH_ATTN_AVAILABLE
NO_DEVICE_CODE;
return;
#endif // FLASH_ATTN_AVAILABLE
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
#ifdef FP16_MMA_AVAILABLE
@@ -285,6 +282,9 @@ static __global__ void flash_attn_tile_ext_f32(
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
}
}
#else
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>