llama/ggml: add LLM training support (#10544)
* llama/ggml: add LLM training support more compact progress bar llama_save_model_to_file llama_opt_param_filter ggml_graph_dup force_grads refactor ggml_opt, fix test-opt * remove logits_all * refactor CUDA implementation for ACC * reset graph at beginning of opt period
This commit is contained in:
parent
064cc596ac
commit
10d2af0eaa
31 changed files with 1415 additions and 359 deletions
|
@ -5499,7 +5499,7 @@ static void ggml_compute_backward(
|
|||
// tensor = src0 * 1 + src1 * 0
|
||||
if (src0_needs_grads) {
|
||||
// dsrc0 = dtensor * 1
|
||||
ggml_add_or_set(ctx, cgraph, isrc0, grad);
|
||||
ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad, src0));
|
||||
}
|
||||
if (src1_needs_grads) {
|
||||
// dsrc1 = dtensor * 0 -> noop
|
||||
|
@ -5780,10 +5780,9 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|||
}
|
||||
|
||||
void ggml_build_backward_expand(
|
||||
struct ggml_context * ctx_static,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_cgraph * cgraph,
|
||||
bool accumulate) {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** grad_accs) {
|
||||
GGML_ASSERT(cgraph->n_nodes > 0);
|
||||
GGML_ASSERT(cgraph->grads);
|
||||
GGML_ASSERT(cgraph->grad_accs);
|
||||
|
@ -5856,21 +5855,24 @@ void ggml_build_backward_expand(
|
|||
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
|
||||
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
|
||||
|
||||
const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(igrad != GGML_HASHSET_FULL);
|
||||
GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
|
||||
if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
|
||||
cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
|
||||
cgraph->grads[igrad] = cgraph->grad_accs[igrad];
|
||||
ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
|
||||
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(ihash != GGML_HASHSET_FULL);
|
||||
GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash));
|
||||
if (grad_accs && grad_accs[i]) {
|
||||
cgraph->grad_accs[ihash] = grad_accs[i];
|
||||
cgraph->grads[ihash] = cgraph->grad_accs[ihash];
|
||||
} else if (node->flags & GGML_TENSOR_FLAG_LOSS) {
|
||||
// loss tensors always need a gradient accumulator
|
||||
cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
cgraph->grads[ihash] = cgraph->grad_accs[ihash];
|
||||
}
|
||||
grads_needed[igrad] = true;
|
||||
grads_needed[ihash] = true;
|
||||
}
|
||||
|
||||
for (int i = n_nodes_f - 1; i >= 0; --i) {
|
||||
// inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
|
||||
// use allocator to automatically make inplace operations
|
||||
ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
|
||||
ggml_compute_backward(ctx, cgraph, i, grads_needed);
|
||||
}
|
||||
|
||||
free(grads_needed);
|
||||
|
@ -6016,8 +6018,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
|
||||
struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) {
|
||||
struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads || force_grads);
|
||||
ggml_graph_cpy(cgraph, result);
|
||||
return result;
|
||||
}
|
||||
|
@ -6036,6 +6038,9 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
|
|||
}
|
||||
|
||||
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
||||
if (!cgraph) {
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(cgraph->grads != NULL);
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
|
@ -6345,8 +6350,8 @@ void ggml_set_output(struct ggml_tensor * tensor) {
|
|||
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
||||
}
|
||||
|
||||
void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
||||
GGML_UNUSED(ctx); // TODO: remove this parameter
|
||||
void ggml_set_param(struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->op == GGML_OP_NONE);
|
||||
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue