
* CUDA: Simplify and improve CUDA graphs through use of indirect copy pointers Previously there was complexity in the CUDA graphs implementation due frequently changing parameters to copy kernels associated with K and V cache pointers. This patch simplifies by using indirection to avoid such parameters frequently changing, avoiding the need for frequent graph updates. Fixes #12152 * Addressed comments * fix HIP builds * properly sync to stream * removed ggml_cuda_cpy_fn_ptrs * move stream sync before free * guard to only use indirection with graphs * style fixes * check for errors --------- Co-authored-by: slaren <slarengh@gmail.com>
11 lines
440 B
Text
11 lines
440 B
Text
#include "common.cuh"
|
|
|
|
#define CUDA_CPY_BLOCK_SIZE 64
|
|
|
|
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
|
|
|
|
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
|
|
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
|
|
|
|
void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
|