CUDA: fix non-cont. inputs for batched mat mul (#13155)
This commit is contained in:
parent
7d3af70b08
commit
cdf76586b2
4 changed files with 94 additions and 42 deletions
|
|
@@ -3,7 +3,7 @@
|
|||
// NOTE(review): presumably the thread-block size used when launching the dequantize kernels;
// the launch sites are not visible in this diff -- confirm.
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
// to_t_cuda_t<T>: pointer to a function that takes an untyped const input `x`, a typed output
// `y`, a 64-bit integer `k` and a CUDA stream, and returns nothing.
// NOTE(review): two variants of the alias are listed back to back. This looks like the
// removed/added pair of this diff hunk (old and new line counts are both 7) with the +/- markers
// lost in extraction; presumably the __restrict__ variant is the old line and the plain-pointer
// variant is the new one -- confirm against the commit.
template<typename T>
|
||||
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
|
||||
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
|
||||
|
||||
// Concrete converter pointer types: to_t_cuda_t instantiated with float and with half as the
// output element type, respectively.
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
||||
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
||||
|
|
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
|
|||
// Declarations only (definitions are not part of this diff). Each function takes a ggml_type
// and returns a value of the named converter type; to_fp32_cuda_t is the float instantiation
// of to_t_cuda_t.
// NOTE(review): to_bf16_cuda_t is declared outside the visible hunks; presumably it is the
// bf16 counterpart of to_fp32_cuda_t -- confirm. Behavior for types without a converter is
// not visible here.
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
|
||||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
|
||||
|
||||
// TODO more general support for non-contiguous inputs
|
||||
|
||||
// to_t_nc_cuda_t<T>: pointer to a function that takes an untyped const input `x`, a typed
// output `y`, four int64_t values ne00..ne03, three int64_t values s01..s03 and a CUDA stream,
// and returns nothing.
// NOTE(review): presumably "nc" stands for non-contiguous (see the TODO preceding this alias),
// ne00..ne03 are the per-dimension element counts of the source, and s01..s03 are its strides
// for dimensions 1..3. Whether the strides are in elements or bytes is not visible here --
// confirm against the implementation.
template<typename T>
|
||||
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
|
||||
|
||||
// to_fp16_nc_cuda_t: to_t_nc_cuda_t instantiated with half as the output element type.
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
|
||||
// Declaration only: takes a ggml_type and returns a to_fp16_nc_cuda_t function pointer.
// NOTE(review): the result for types without such a converter (e.g. a null pointer) is not
// visible in this diff -- callers should presumably check the returned pointer; confirm.
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue