CUDA: non-contiguous (RMS) norm support (#11659)

* CUDA: non-contiguous (RMS) norm support

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Johannes Gäßler 2025-02-04 22:21:42 +01:00 committed by GitHub
parent 3ec9fd4b77
commit fd08255d0d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 97 additions and 47 deletions

View file

@@ -4610,7 +4610,8 @@ struct llm_build_context {
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
cb(k_pe, "k_pe", il);
- kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+ kv_compressed = ggml_cont(ctx0, kv_compressed);
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
model.layers[il].attn_kv_a_norm, NULL,
LLM_NORM_RMS, cb, il);
@@ -6464,7 +6465,8 @@ struct llm_build_context {
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
cb(k_pe, "k_pe", il);
- kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+ kv_compressed = ggml_cont(ctx0, kv_compressed);
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
model.layers[il].attn_kv_a_norm, NULL,
LLM_NORM_RMS, cb, il);