Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)

David Huang 2025-05-11 20:18:39 +08:00, committed by GitHub
parent 3eac209319
commit 7f323a589f
11 changed files with 57 additions and 9 deletions

@@ -332,6 +332,7 @@ struct common_params {
     bool no_kv_offload = false; // disable KV offloading
     bool warmup        = true;  // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload of host tensor operations to device
     bool single_turn   = false; // single turn chat conversation
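
For context, here is a minimal, self-contained sketch of how a boolean flag like --no-op-offload is typically threaded from argv into a params struct such as the one patched above. This is not llama.cpp's actual argument parser; the struct and names below are illustrative stand-ins.

    #include <cstdio>
    #include <cstring>

    // Illustrative stand-in for the common_params struct shown in the diff.
    struct params_sketch {
        bool no_op_offload = false; // same default as the new field
    };

    int main(int argc, char ** argv) {
        params_sketch params;
        for (int i = 1; i < argc; ++i) {
            // Presence of the flag keeps host tensor operations on the host
            // instead of offloading them to the device.
            if (std::strcmp(argv[i], "--no-op-offload") == 0) {
                params.no_op_offload = true;
            }
        }
        std::printf("no_op_offload = %s\n", params.no_op_offload ? "true" : "false");
        return 0;
    }

An invocation pairing this flag with tensor overrides (the model path and -ot pattern here are illustrative, not from this commit) would look something like: llama-cli -m model.gguf -ngl 99 -ot "exps=CPU" --no-op-offload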