Force FP32 compute in GLM4 FFN Down (#13101)
* Force FP32 compute in cuBLAS GEMM

* Revert "Force FP32 compute in cuBLAS GEMM"

  This reverts commit 6efd872732159ab88ee7b3c1d77ba5ebc83079bd.

* Force F32 compute in GLM4 ffn down

* Edit comment to clarify issue

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent edb18b6e8f
commit 558a764713
1 changed file with 4 additions and 0 deletions
@@ -803,6 +803,10 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
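For context, ggml_mul_mat_set_prec only attaches a precision hint to the matmul node; it is up to the backend to honor it (on CUDA this typically means accumulating in FP32 instead of FP16). The sketch below is not part of the commit: it shows the same per-node override applied in a standalone ggml helper, with hypothetical tensor names and shapes chosen for illustration.

// Sketch only: standalone use of the precision override applied in build_ffn above.
// The function name, tensor names, and shapes are illustrative, not from the commit.
#include "ggml.h"

struct ggml_tensor * ffn_down_fp32(
        struct ggml_context * ctx,
        struct ggml_tensor  * down,   // FFN down-projection weight, e.g. [n_ff, n_embd]
        struct ggml_tensor  * cur) {  // activations, e.g. [n_ff, n_tokens]
    // down @ cur -> [n_embd, n_tokens]
    struct ggml_tensor * out = ggml_mul_mat(ctx, down, cur);

    // Mark this single node as requiring FP32 accumulation; other matmuls in the
    // graph keep the backend's default (possibly half-precision) accumulators.
    ggml_mul_mat_set_prec(out, GGML_PREC_F32);

    return out;
}

Scoping the override to one node is the design choice that replaced the reverted "force FP32 in cuBLAS GEMM" approach: only the GLM4 FFN down projection pays the cost of FP32 accumulation, rather than every GEMM in the graph.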