ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
slaren 2023-08-22 15:25:19 +02:00 committed by GitHub
parent ef3f333d37
commit 1123f7fbdf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 92 additions and 228 deletions

View file

@ -3887,13 +3887,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
// rope == RoPE == rotary positional embedding
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale) {
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ncols + col;
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@ -3965,8 +3965,8 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
}
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int col = blockDim.y*blockIdx.y + threadIdx.y;
const int row = blockDim.x*blockIdx.x + threadIdx.x;
if (col >= ncols) {
return;
@ -3982,9 +3982,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
// values are also not normalized to the maximum value by subtracting it in the exponential function
// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int block_size = blockDim.x;
const int tid = threadIdx.x;
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int block_size = blockDim.y;
const int tid = threadIdx.y;
float tmp = 0.0;
@ -4776,9 +4776,9 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(nrows % 2 == 0);
const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(num_blocks_x, nrows, 1);
const dim3 block_nums(nrows, num_blocks_x, 1);
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
}
@ -4800,15 +4800,15 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
}
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
const dim3 block_nums(block_num_x, nrows_x, 1);
const dim3 block_nums(nrows_x, block_num_x, 1);
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(1, nrows_x, 1);
const dim3 block_dims(1, WARP_SIZE, 1);
const dim3 block_nums(nrows_x, 1, 1);
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
}
@ -6313,7 +6313,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
return extra;
}
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
if (scratch && g_scratch_size == 0) {
return;
}
@ -6322,14 +6322,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
}
}
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
}
tensor->backend = GGML_BACKEND_GPU;
if (scratch && no_alloc) {
return;
}
struct ggml_tensor_extra_gpu * extra;
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@ -6381,16 +6386,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
tensor->extra = extra;
}
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
if (g_scratch_size == 0) {
return;
}
if (g_scratch_buffer == nullptr) {
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
}
struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW;
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) {
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
}
extra->data_device[g_main_device] = src0_ddc + view_offset;
} else {
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
}
tensor->extra = extra;
}
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true, false);
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
}
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
}
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, false);
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
}
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, true);
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
}
void ggml_cuda_set_main_device(int main_device) {