Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)
This commit is contained in: parent 3eac209319, commit 7f323a589f
11 changed files with 57 additions and 9 deletions
@@ -363,6 +363,7 @@ extern "C" {
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
+        bool op_offload;  // whether to offload host tensor operations to device
     };

     // model quantization parameters
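The hunk above adds a single op_offload flag to the context parameters (the surrounding fields match llama_context_params in llama.h). Below is a minimal C sketch of disabling host-op offloading through the public API; it assumes this field is what the --no-op-offload CLI flag maps to, uses llama_model_load_from_file / llama_init_from_model from llama.h, and the model path is a placeholder.

#include "llama.h"

#include <stdbool.h>
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Load a model with default parameters ("model.gguf" is a placeholder path).
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Keep host tensor operations on the CPU instead of offloading them to the device;
    // assumed to be the programmatic equivalent of passing --no-op-offload on the CLI.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.op_offload = false;

    struct llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    // ... run prompt processing / generation here ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}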