Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)

Author: David Huang
Date:   2025-05-11 20:18:39 +08:00 (committed by GitHub)
Commit: 7f323a589f (parent 3eac209319)
11 changed files with 57 additions and 9 deletions

common/arg.cpp

@@ -2437,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",