CUDA: fix non-contiguous inputs for batched mat mul (#13155)

2025-04-29 16:00:27 +02:00 · 2025-04-29 16:00:27 +02:00 · cdf76586b2
commit cdf76586b2
parent 7d3af70b08
4 changed files with 94 additions and 42 deletions
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -10195,7 +10195,6 @@ struct llm_build_deepseek2 : public llm_graph_context {

                    // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
                    ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
-                    ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
                    cb(q_nope_absorbed, "q_nope_absorbed", il);

                    // {kv_lora_rank, n_head, n_tokens}