llama-bench: add -d
depth arg (#13096)
* add depth param * update llama-bench README and add depth param * llama-bench: default params for depth arg for faster execution * Update examples/llama-bench/README.md Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * fix buffer print ub * use user provided args * remove extra whitespaces --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
4e87962e34
commit
1831f538f7
2 changed files with 137 additions and 65 deletions
|
@ -28,6 +28,7 @@ options:
|
|||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-pg <pp,tg> (default: )
|
||||
-d, --n-depth <n> (default: 0)
|
||||
-b, --batch-size <n> (default: 2048)
|
||||
-ub, --ubatch-size <n> (default: 512)
|
||||
-ctk, --cache-type-k <t> (default: f16)
|
||||
|
@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
|
|||
|
||||
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
|
||||
|
||||
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
|
||||
|
||||
For a description of the other options, see the [main example](../main/README.md).
|
||||
|
||||
Note:
|
||||
|
@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
|
|||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
|
||||
|
||||
### Different prefilled context
|
||||
|
||||
```
|
||||
$ ./llama-bench -d 0,512
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
|
||||
|
||||
## Output formats
|
||||
|
||||
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
|
||||
|
@ -170,9 +186,9 @@ $ ./llama-bench -o csv
|
|||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
|
||||
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
|
||||
```
|
||||
|
||||
### JSON
|
||||
|
@ -184,64 +200,78 @@ $ ./llama-bench -o json
|
|||
```json
|
||||
[
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"build_commit": "8cf427ff",
|
||||
"build_number": 5163,
|
||||
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
|
||||
"gpu_info": "NVIDIA GeForce RTX 4080",
|
||||
"backends": "CUDA",
|
||||
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
|
||||
"model_type": "qwen2 7B Q4_K - Medium",
|
||||
"model_size": 4677120000,
|
||||
"model_n_params": 7615616512,
|
||||
"n_batch": 2048,
|
||||
"n_ubatch": 512,
|
||||
"n_threads": 8,
|
||||
"cpu_mask": "0x0",
|
||||
"cpu_strict": false,
|
||||
"poll": 50,
|
||||
"type_k": "f16",
|
||||
"type_v": "f16",
|
||||
"n_gpu_layers": 99,
|
||||
"split_mode": "layer",
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"no_kv_offload": false,
|
||||
"flash_attn": false,
|
||||
"tensor_split": "0.00",
|
||||
"use_mmap": true,
|
||||
"embeddings": false,
|
||||
"n_prompt": 512,
|
||||
"n_gen": 0,
|
||||
"test_time": "2023-09-23T12:09:57Z",
|
||||
"avg_ns": 212365953,
|
||||
"stddev_ns": 985423,
|
||||
"avg_ts": 2410.974041,
|
||||
"stddev_ts": 11.163766,
|
||||
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
|
||||
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
|
||||
"n_depth": 0,
|
||||
"test_time": "2025-04-24T11:58:50Z",
|
||||
"avg_ns": 72135640,
|
||||
"stddev_ns": 1453752,
|
||||
"avg_ts": 7100.002165,
|
||||
"stddev_ts": 140.341520,
|
||||
"samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
|
||||
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
|
||||
},
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"build_commit": "8cf427ff",
|
||||
"build_number": 5163,
|
||||
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
|
||||
"gpu_info": "NVIDIA GeForce RTX 4080",
|
||||
"backends": "CUDA",
|
||||
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
|
||||
"model_type": "qwen2 7B Q4_K - Medium",
|
||||
"model_size": 4677120000,
|
||||
"model_n_params": 7615616512,
|
||||
"n_batch": 2048,
|
||||
"n_ubatch": 512,
|
||||
"n_threads": 8,
|
||||
"cpu_mask": "0x0",
|
||||
"cpu_strict": false,
|
||||
"poll": 50,
|
||||
"type_k": "f16",
|
||||
"type_v": "f16",
|
||||
"n_gpu_layers": 99,
|
||||
"split_mode": "layer",
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"no_kv_offload": false,
|
||||
"flash_attn": false,
|
||||
"tensor_split": "0.00",
|
||||
"use_mmap": true,
|
||||
"embeddings": false,
|
||||
"n_prompt": 0,
|
||||
"n_gen": 128,
|
||||
"test_time": "2023-09-23T12:09:59Z",
|
||||
"avg_ns": 977425219,
|
||||
"stddev_ns": 9268593,
|
||||
"avg_ts": 130.965708,
|
||||
"stddev_ts": 1.238924,
|
||||
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
|
||||
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
|
||||
"n_depth": 0,
|
||||
"test_time": "2025-04-24T11:58:51Z",
|
||||
"avg_ns": 1076767880,
|
||||
"stddev_ns": 9449585,
|
||||
"avg_ts": 118.881588,
|
||||
"stddev_ts": 1.041811,
|
||||
"samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
|
||||
"samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
|
|||
```
|
||||
|
||||
```json lines
|
||||
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
|
||||
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
|
||||
```
|
||||
|
||||
|
||||
|
@ -271,25 +301,32 @@ $ ./llama-bench -o sql
|
|||
CREATE TABLE IF NOT EXISTS test (
|
||||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cuda INTEGER,
|
||||
metal INTEGER,
|
||||
gpu_blas INTEGER,
|
||||
blas INTEGER,
|
||||
cpu_info TEXT,
|
||||
gpu_info TEXT,
|
||||
backends TEXT,
|
||||
model_filename TEXT,
|
||||
model_type TEXT,
|
||||
model_size INTEGER,
|
||||
model_n_params INTEGER,
|
||||
n_batch INTEGER,
|
||||
n_ubatch INTEGER,
|
||||
n_threads INTEGER,
|
||||
f16_kv INTEGER,
|
||||
cpu_mask TEXT,
|
||||
cpu_strict INTEGER,
|
||||
poll INTEGER,
|
||||
type_k TEXT,
|
||||
type_v TEXT,
|
||||
n_gpu_layers INTEGER,
|
||||
split_mode TEXT,
|
||||
main_gpu INTEGER,
|
||||
mul_mat_q INTEGER,
|
||||
no_kv_offload INTEGER,
|
||||
flash_attn INTEGER,
|
||||
tensor_split TEXT,
|
||||
use_mmap INTEGER,
|
||||
embeddings INTEGER,
|
||||
n_prompt INTEGER,
|
||||
n_gen INTEGER,
|
||||
n_depth INTEGER,
|
||||
test_time TEXT,
|
||||
avg_ns INTEGER,
|
||||
stddev_ns INTEGER,
|
||||
|
@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test (
|
|||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
|
||||
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
|
||||
```
|
||||
|
|
|
@ -200,6 +200,7 @@ struct cmd_params {
|
|||
std::vector<int> n_prompt;
|
||||
std::vector<int> n_gen;
|
||||
std::vector<std::pair<int, int>> n_pg;
|
||||
std::vector<int> n_depth;
|
||||
std::vector<int> n_batch;
|
||||
std::vector<int> n_ubatch;
|
||||
std::vector<ggml_type> type_k;
|
||||
|
@ -233,6 +234,7 @@ static const cmd_params cmd_params_defaults = {
|
|||
/* n_prompt */ { 512 },
|
||||
/* n_gen */ { 128 },
|
||||
/* n_pg */ {},
|
||||
/* n_depth */ { 0 },
|
||||
/* n_batch */ { 2048 },
|
||||
/* n_ubatch */ { 512 },
|
||||
/* type_k */ { GGML_TYPE_F16 },
|
||||
|
@ -272,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -pg <pp,tg> (default: %s)\n",
|
||||
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
||||
|
@ -409,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
break;
|
||||
}
|
||||
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
||||
} else if (arg == "-d" || arg == "--n-depth") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -739,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
if (params.n_pg.empty()) {
|
||||
params.n_pg = cmd_params_defaults.n_pg;
|
||||
}
|
||||
if (params.n_depth.empty()) {
|
||||
params.n_depth = cmd_params_defaults.n_depth;
|
||||
}
|
||||
if (params.n_batch.empty()) {
|
||||
params.n_batch = cmd_params_defaults.n_batch;
|
||||
}
|
||||
|
@ -801,6 +814,7 @@ struct cmd_params_instance {
|
|||
std::string model;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
int n_batch;
|
||||
int n_ubatch;
|
||||
ggml_type type_k;
|
||||
|
@ -880,7 +894,7 @@ struct cmd_params_instance {
|
|||
llama_context_params to_llama_cparams() const {
|
||||
llama_context_params cparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = n_prompt + n_gen;
|
||||
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
||||
cparams.n_batch = n_batch;
|
||||
cparams.n_ubatch = n_ubatch;
|
||||
cparams.type_k = type_k;
|
||||
|
@ -916,6 +930,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
for (const auto & nt : params.n_threads)
|
||||
for (const auto & cm : params.cpu_mask)
|
||||
for (const auto & cs : params.cpu_strict)
|
||||
for (const auto & nd : params.n_depth)
|
||||
for (const auto & pl : params.poll) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
|
@ -925,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_prompt,
|
||||
/* .n_gen = */ 0,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
|
@ -955,6 +971,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .model = */ m,
|
||||
/* .n_prompt = */ 0,
|
||||
/* .n_gen = */ n_gen,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
|
@ -985,6 +1002,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_pg.first,
|
||||
/* .n_gen = */ n_pg.second,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
|
@ -1040,6 +1058,7 @@ struct test {
|
|||
bool embeddings;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
std::string test_time;
|
||||
std::vector<uint64_t> samples_ns;
|
||||
|
||||
|
@ -1072,6 +1091,7 @@ struct test {
|
|||
embeddings = inst.embeddings;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
n_depth = inst.n_depth;
|
||||
// RFC 3339 date-time format
|
||||
time_t t = time(NULL);
|
||||
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
||||
|
@ -1113,9 +1133,11 @@ struct test {
|
|||
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
||||
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
|
||||
"embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
|
||||
"stddev_ns", "avg_ts", "stddev_ts",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
|
||||
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
|
@ -1125,8 +1147,8 @@ struct test {
|
|||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
||||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
|
||||
field == "stddev_ns") {
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||
|
@ -1204,6 +1226,7 @@ struct test {
|
|||
std::to_string(embeddings),
|
||||
std::to_string(n_prompt),
|
||||
std::to_string(n_gen),
|
||||
std::to_string(n_depth),
|
||||
test_time,
|
||||
std::to_string(avg_ns()),
|
||||
std::to_string(stdev_ns()),
|
||||
|
@ -1381,7 +1404,7 @@ struct markdown_printer : public printer {
|
|||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
return 15;
|
||||
}
|
||||
|
||||
int width = std::max((int) field.length(), 10);
|
||||
|
@ -1531,6 +1554,10 @@ struct markdown_printer : public printer {
|
|||
} else {
|
||||
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
|
||||
}
|
||||
if (t.n_depth > 0) {
|
||||
int len = strlen(buf);
|
||||
snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
|
||||
}
|
||||
value = buf;
|
||||
} else if (field == "t/s") {
|
||||
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
||||
|
@ -1789,6 +1816,14 @@ int main(int argc, char ** argv) {
|
|||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
if (t.n_depth > 0) {
|
||||
if (params.progress) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
|
||||
i + 1, params.reps);
|
||||
}
|
||||
test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
|
||||
}
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
|
||||
if (t.n_prompt > 0) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue