CUDA: fix non-contiguous inputs for batched mat mul (#13155)
This commit is contained in:
parent
7d3af70b08
commit
cdf76586b2
4 changed files with 94 additions and 42 deletions
|
@@ -10195,7 +10195,6 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|||
|
||||
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
|
||||
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
|
||||
ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
|
||||
cb(q_nope_absorbed, "q_nope_absorbed", il);
|
||||
|
||||
// {kv_lora_rank, n_head, n_tokens}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue