cuda : fix defrag with quantized KV (#9319)
This commit is contained in:
parent
bdf314f38a
commit
4db04784f9
3 changed files with 40 additions and 19 deletions
|
@ -1165,6 +1165,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||
}
|
||||
}
|
||||
|
||||
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
||||
// since the tensor is pre-allocated, it cannot be moved to another backend
|
||||
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
||||
}
|
||||
|
||||
// graph input
|
||||
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
||||
|
@ -1644,7 +1649,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
sched->prev_leaf_backend_ids = tmp;
|
||||
}
|
||||
|
||||
int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||
int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
||||
if (sched->graph.size < graph_size) {
|
||||
sched->graph.size = graph_size;
|
||||
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
||||
|
@ -1696,6 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||
}
|
||||
}
|
||||
|
@ -1709,6 +1715,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||
}
|
||||
}
|
||||
|
@ -1719,6 +1726,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue