Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)

David Huang 2025-05-11 20:18:39 +08:00 committed by GitHub
parent 3eac209319
commit 7f323a589f
11 changed files with 57 additions and 9 deletions


@@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn  = params.flash_attn;
     cparams.no_perf     = params.no_perf;
+    cparams.op_offload  = !params.no_op_offload;

     if (params.reranking) {
         cparams.embeddings = true;
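
For context, the new op_offload field is copied into llama_context_params, so API callers can get the same effect as the --no-op-offload CLI flag when creating a context. The following is a minimal, hypothetical usage sketch (not part of this commit); it assumes a model path passed on the command line and otherwise default parameters, and uses the post-rename llama.cpp C API names (llama_model_load_from_file, llama_init_from_model).

// Hypothetical sketch: disable op offload through the C API, mirroring what
// the --no-op-offload command-line flag does. Not taken from this commit.
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1; // expects a GGUF model path as the first argument
    }

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.op_offload = false; // same effect as passing --no-op-offload

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    // ... run prompt processing / generation here ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}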