MUSA: support ARM64 and enable dp4a, etc. (#11843)
* MUSA: support ARM64 and enable __dp4a, etc.
* fix cross entropy loss op for musa
* update
* add cc info log for musa
* add comment for the MUSA .cc calculation block
---------
Co-authored-by: Bodhi Hu <huaishun.hu@mthreads.com>
This commit is contained in:
parent
ee02ad02c5
commit
0b3863ff95
7 changed files with 25 additions and 15 deletions
|
@ -411,13 +411,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
|||
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
return __dp4a(a, b, c);
|
||||
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
|
||||
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
const int8_t * a8 = (const int8_t *) &a;
|
||||
const int8_t * b8 = (const int8_t *) &b;
|
||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue