CUDA/HIP: refractor mmqv to unify the calculation of nwarps and rows per block between host and device code. (#12177)

refactor mmqv to unify the calculation of nwarps and rows per block between host and device code.

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
uvos 2025-03-11 20:16:03 +01:00 committed by GitHub
parent ba7654380a
commit 10f2e81809
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 142 additions and 59 deletions

View file

@ -395,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
#elif defined(RDNA1) || defined(__gfx900__)
int tmp1;
int tmp2;
asm("\n \