llama: Add attention and final logit soft-capping, update scaling factor to Gemma2 (#8197)

* Add attention and final logit softcapping. * fix * Add custom add_ functions * Disable flash attention for Gemma2 * Update src/llama.cpp Co-authored-by: slaren <slarengh@gmail.com> * Add default value for attention and final logit softcap value * Add custom kq scaling from Gemma2Attention * Remove custom pre attention scaling and use computed value instead. --------- Co-authored-by: slaren <slarengh@gmail.com>
2024-06-29 20:44:08 -07:00 · 2024-06-29 20:44:08 -07:00 · 1c5eba6f8e
commit 1c5eba6f8e
parent 72272b83a3
4 changed files with 46 additions and 3 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -2363,6 +2363,12 @@ class Gemma2Model(Model):
        self.gguf_writer.add_key_length(hparams["head_dim"])
        self.gguf_writer.add_value_length(hparams["head_dim"])
        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_attn_logit_softcapping(
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_final_logit_softcapping(
+            self.hparams["final_logit_softcapping"]
+        )

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unusem