CUDA: optimize FA for GQA + large batches (#12014)

This commit is contained in:
Johannes Gäßler 2025-02-22 12:20:17 +01:00 committed by GitHub
parent 335eb04a91
commit 5fa07c2f93
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 940 additions and 411 deletions

View file

@@ -24,7 +24,7 @@ static __device__ __forceinline__ void cp_async_cg_16(const unsigned int dst, co
} else
#endif // CUDART_VERSION >= 11040
{
-asm volatile("cp.async.cg.shared.global.L2 [%0], [%1], 16;"
+asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
: : "r"(dst), "l"(src));
}
#else