CUDA: optimize FA for GQA + large batches (#12014)
This commit is contained in:
parent
335eb04a91
commit
5fa07c2f93
32 changed files with 940 additions and 411 deletions
|
@@ -24,7 +24,7 @@ static __device__ __forceinline__ void cp_async_cg_16(const unsigned int dst, co
|
|||
} else
|
||||
#endif // CUDART_VERSION >= 11040
|
||||
{
|
||||
-asm volatile("cp.async.cg.shared.global.L2 [%0], [%1], 16;"
|
||||
+asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
|
||||
: : "r"(dst), "l"(src));
|
||||
}
|
||||
#else
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue