llama : move end-user examples to tools directory (#13249)
* llama : move end-user examples to tools directory --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
parent
b34443923c
commit
1d36b3670b
213 changed files with 226 additions and 190 deletions
119
tools/server/bench/README.md
Normal file
119
tools/server/bench/README.md
Normal file
|
@ -0,0 +1,119 @@
|
|||
### Server benchmark tools
|
||||
|
||||
The benchmark uses [k6](https://k6.io/).
|
||||
|
||||
#### Install k6 and the SSE extension
|
||||
|
||||
SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
|
||||
|
||||
Example (assuming golang >= 1.21 is installed):
|
||||
```shell
|
||||
go install go.k6.io/xk6/cmd/xk6@latest
|
||||
$GOPATH/bin/xk6 build master \
|
||||
--with github.com/phymbert/xk6-sse
|
||||
```
|
||||
|
||||
#### Download a dataset
|
||||
|
||||
This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
|
||||
|
||||
```shell
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
```
|
||||
|
||||
#### Download a model
|
||||
Example for PHI-2
|
||||
|
||||
```shell
|
||||
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
|
||||
```
|
||||
|
||||
#### Start the server
|
||||
The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`.
|
||||
|
||||
Example:
|
||||
```shell
|
||||
llama-server --host localhost --port 8080 \
|
||||
--model ggml-model-q4_0.gguf \
|
||||
--cont-batching \
|
||||
--metrics \
|
||||
--parallel 8 \
|
||||
--batch-size 512 \
|
||||
--ctx-size 4096 \
|
||||
-ngl 33
|
||||
```
|
||||
|
||||
#### Run the benchmark
|
||||
|
||||
To run 500 chat completion requests with 8 concurrent users for at most 10 minutes:
|
||||
```shell
|
||||
./k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||
```
|
||||
|
||||
The benchmark values can be overridden with:
|
||||
- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
|
||||
- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
|
||||
- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
|
||||
- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
|
||||
- `SERVER_BENCH_DATASET` path to the benchmark dataset file
|
||||
- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens to filter out in the dataset: default `1024`
|
||||
- `SERVER_BENCH_MAX_CONTEXT` maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens, default `2048`
|
||||
|
||||
Note: the local tokenizer is only a naive split on whitespace and basic punctuation, so the real number of tokens will differ.
|
||||
|
||||
Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
|
||||
|
||||
```shell
|
||||
SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||
```
|
||||
|
||||
To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`.
|
||||
|
||||
#### Metrics
|
||||
|
||||
The following metrics are computed from the `usage` field of the OAI chat completions response:
|
||||
- `llamacpp_tokens_second` Trend of `usage.completion_tokens / generation duration` (the time between the first streamed event and the end of the response)
|
||||
- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens`
|
||||
- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens`
|
||||
- `llamacpp_completion_tokens` Trend of `usage.completion_tokens`
|
||||
- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens`
|
||||
- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
|
||||
- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'`
|
||||
|
||||
The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`.
|
||||
|
||||
The k6 metrics can be compared against the [server metrics](../README.md), with:
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/metrics
|
||||
```
|
||||
|
||||
### Using the CI python script
|
||||
The `bench.py` script does several steps:
|
||||
- start the server
|
||||
- define the required variables for k6
|
||||
- run k6 script
|
||||
- extract metrics from prometheus
|
||||
|
||||
It is intended to be used in the CI, but you can also run it manually:
|
||||
|
||||
```shell
|
||||
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
|
||||
--runner-label local \
|
||||
--name local \
|
||||
--branch `git rev-parse --abbrev-ref HEAD` \
|
||||
--commit `git rev-parse HEAD` \
|
||||
--scenario script.js \
|
||||
--duration 5m \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model-path-prefix models \
|
||||
--parallel 4 \
|
||||
-ngl 33 \
|
||||
--batch-size 2048 \
|
||||
--ubatch-size 256 \
|
||||
--ctx-size 4096 \
|
||||
--n-prompts 200 \
|
||||
--max-prompt-tokens 256 \
|
||||
--max-tokens 256
|
||||
```
|
323
tools/server/bench/bench.py
Normal file
323
tools/server/bench/bench.py
Normal file
|
@ -0,0 +1,323 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
|
||||
import matplotlib
|
||||
import matplotlib.dates
|
||||
import matplotlib.pyplot as plt
|
||||
import requests
|
||||
from statistics import mean
|
||||
|
||||
|
||||
def main(args_in: list[str] | None = None) -> None:
    """Run one complete server benchmark and export its results.

    Steps, in order:
      1. parse the command line (``args_in``, or ``sys.argv`` when it is ``None``);
      2. start the server and wait until it is ready (exit status 1 on failure);
      3. run the k6 scenario and copy every numeric k6 metric to ``results.github.env``;
      4. stop the server (interrupt signal first, force-kill after 500 ms);
      5. if something is listening on port 9090, query Prometheus for the ``llamacpp:*``
         metrics and write ``<metric>.json``, ``<metric>.jpg`` and ``<metric>.mermaid``;
      6. append the compact ``BENCH_*`` summary variables to ``results.github.env``.

    A failure of the k6 run is reported but does not skip steps 4 and 5; the process then
    exits with status 1 because no summary can be produced.
    """
    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
    parser.add_argument("--name", type=str, help="Bench name", required=True)
    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
    parser.add_argument("--port", type=int, help="Server listen port", default=8080)
    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
    parser.add_argument("--n-prompts", type=int,
                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
    parser.add_argument("--max-prompt-tokens", type=int,
                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
                        required=True)
    parser.add_argument("--max-tokens", type=int,
                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
    parser.add_argument("--duration", type=str, help="Bench scenario", required=True)

    args = parser.parse_args(args_in)

    # Lower bound of the Prometheus query window.
    start_time = time.time()

    # Start the server and performance scenario
    try:
        server_process = start_server(args)
    except Exception:
        print("bench: server start error :")
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    # start the benchmark
    iterations = 0
    data = {}
    try:
        start_benchmark(args)

        with open("results.github.env", 'w') as github_env:
            # parse output
            with open('k6-results.json', 'r') as bench_results:
                # Load JSON data from file
                data = json.load(bench_results)
                for metric_name in data['metrics']:
                    for metric_metric in data['metrics'][metric_name]:
                        value = data['metrics'][metric_name][metric_metric]
                        if isinstance(value, (float, int)):
                            # Numeric values are rounded in place so the summary below reuses them.
                            value = round(value, 2)
                            data['metrics'][metric_name][metric_metric] = value
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
                iterations = data['root_group']['checks']['success completion']['passes']

    except Exception:
        # Reported only: the server must still be stopped below.
        print("bench: error :")
        traceback.print_exc(file=sys.stdout)

    # Stop the server
    if server_process:
        try:
            print(f"bench: shutting down server pid={server_process.pid} ...")
            if os.name == 'nt':
                interrupt = signal.CTRL_C_EVENT
            else:
                interrupt = signal.SIGINT
            server_process.send_signal(interrupt)
            server_process.wait(0.5)

        except subprocess.TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
            server_process.kill()  # SIGKILL
            server_process.wait()

        # Wait until the port is actually released.
        while is_server_listening(args.host, args.port):
            time.sleep(0.1)

    title = (f"llama.cpp {args.name} on {args.runner_label}\n "
             f"duration={args.duration} {iterations} iterations")
    xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
              f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
              f"branch={args.branch} commit={args.commit}")

    # Prometheus
    end_time = time.time()
    prometheus_metrics = {}
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']

        for metric in metrics:
            resp = requests.get("http://localhost:9090/api/v1/query_range",
                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})

            # The raw response is kept on disk even when it is an error.
            with open(f"{metric}.json", 'w') as metric_json:
                metric_json.write(resp.text)

            if resp.status_code != 200:
                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
                continue

            metric_data = resp.json()
            series = metric_data['data']['result']
            if not series:
                # Nothing was scraped for this metric in the time window: there is nothing to plot.
                print(f"bench: no prometheus samples for metric {metric}")
                continue

            values = series[0]['values']
            timestamps, metric_values = zip(*values)
            metric_values = [float(value) for value in metric_values]
            prometheus_metrics[metric] = metric_values
            timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
            plt.figure(figsize=(16, 10), dpi=80)
            plt.plot(timestamps_dt, metric_values, label=metric)
            plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
            plt.yticks(fontsize=12, alpha=.7)

            ylabel = f"llamacpp:{metric}"
            plt.title(title,
                      fontsize=14, wrap=True)
            plt.grid(axis='both', alpha=.3)
            plt.ylabel(ylabel, fontsize=22)
            plt.xlabel(xlabel, fontsize=14, wrap=True)
            plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
            plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
            plt.gcf().autofmt_xdate()

            # Remove borders
            plt.gca().spines["top"].set_alpha(0.0)
            plt.gca().spines["bottom"].set_alpha(0.3)
            plt.gca().spines["right"].set_alpha(0.0)
            plt.gca().spines["left"].set_alpha(0.3)

            # Save the plot as a jpg image
            plt.savefig(f'{metric}.jpg', dpi=60)
            plt.close()

            # Mermaid format in case images upload failed
            with open(f"{metric}.mermaid", 'w') as mermaid_f:
                mermaid = (
                    f"""---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
    title "{title}"
    y-axis "llamacpp:{metric}"
    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
    line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
""")
                mermaid_f.write(mermaid)

    if 'metrics' not in data:
        # The k6 run (or the parsing of its summary) failed and was reported above. Exit with the
        # same status an unhandled error would give, but with an explicit message.
        print("bench: no k6 metrics available, unable to build the benchmark summary")
        sys.exit(1)

    # 140 chars max for commit status description
    bench_results = {
        "i": iterations,
        "req": {
            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
        },
        "pp": {
            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
        },
        "tg": {
            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
        },
    }
    with open("results.github.env", 'a') as github_env:
        github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
        github_env.write(f"BENCH_ITERATIONS={iterations}\n")

        # Each entry of the env file is a single line, so the line breaks are flattened.
        title = title.replace('\n', ' ')
        xlabel = xlabel.replace('\n', ' ')
        github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
        github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
|
||||
|
||||
|
||||
def start_benchmark(args):
    """Run the k6 scenario and wait for it to finish.

    The k6 binary is ``./k6`` unless the ``BENCH_K6_BIN_PATH`` environment variable is set.
    k6 writes its summary to ``k6-results.json`` and its samples to ``k6-results.csv``.

    Args:
        args: parsed command line; reads ``scenario``, ``duration``, ``n_prompts``, ``parallel``,
            ``max_prompt_tokens`` and ``max_tokens``.

    Raises:
        Exception: if k6 exits with a non-zero status.
    """
    k6_path = os.environ.get('BENCH_K6_BIN_PATH', './k6')
    command = [str(part) for part in [
        k6_path,
        'run', args.scenario,
        '--no-color',
        '--no-connection-reuse',
        '--no-vu-connection-reuse',
        '--duration', args.duration,
        '--iterations', args.n_prompts,
        '--vus', args.parallel,
        '--summary-export', 'k6-results.json',
        '--out', 'csv=k6-results.csv',
    ]]

    # Settings read by the scenario script through the environment.
    bench_env = {
        'SERVER_BENCH_N_PROMPTS': str(args.n_prompts),
        'SERVER_BENCH_MAX_PROMPT_TOKENS': str(args.max_prompt_tokens),
        'SERVER_BENCH_MAX_CONTEXT': str(args.max_tokens),
    }

    shown = ' '.join([f"{name}={value}" for name, value in bench_env.items()] + command)
    print(f"bench: starting k6 with: {shown}")

    # An argument list and an explicit environment are used instead of a shell command line:
    # paths containing spaces or shell metacharacters are passed through untouched, and the
    # variables do not rely on the POSIX-only `VAR=value command` syntax.
    k6_completed = subprocess.run(command, env={**os.environ, **bench_env},
                                  stdout=sys.stdout, stderr=sys.stderr)
    if k6_completed.returncode != 0:
        raise Exception("bench: unable to run k6")
|
||||
|
||||
|
||||
def start_server(args):
    """Launch the server and block until it accepts connections and reports healthy.

    The port and then the health endpoint are each polled every 0.5 s, for at most 600 polls
    (1200 when the ``GITHUB_ACTIONS`` environment variable is set).

    Args:
        args: parsed command line; ``host`` and ``port`` are read here, the rest is forwarded
            to ``start_server_background``.

    Returns:
        The ``subprocess.Popen`` handle of the running server.

    Raises:
        AssertionError: if the server exits early, or does not start or become ready in time.
            The server process is killed before the error propagates.
    """
    server_process = start_server_background(args)

    max_attempts = 600
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    def wait_until(probe, waiting_message, failure_message):
        # Poll `probe` until it succeeds, the attempts run out, or the server dies.
        attempts = 0
        while not probe(args.host, args.port):
            attempts += 1
            if attempts > max_attempts:
                # Raised explicitly: an `assert` statement would be removed by `python -O`.
                raise AssertionError(failure_message)
            if server_process.poll() is not None:
                # The process is gone; polling further could never succeed.
                raise AssertionError(f"{failure_message}: server exited with code {server_process.returncode}")
            print(waiting_message)
            time.sleep(0.5)

    try:
        wait_until(is_server_listening, "bench: waiting for server to start ...", "server not started")
        wait_until(is_server_ready, "bench: waiting for server to be ready ...", "server not ready")
    except BaseException:
        # Do not leave an orphan server behind: besides holding the port, its open output pipes
        # would keep the log-forwarding threads alive and prevent the interpreter from exiting.
        server_process.kill()
        server_process.wait()
        raise

    print("bench: server started and ready.")
    return server_process
|
||||
|
||||
|
||||
def start_server_background(args):
    """Spawn the server process and forward its output to this process.

    The binary is ``../../../build/bin/llama-server`` unless the ``LLAMA_SERVER_BIN_PATH``
    environment variable is set. Two threads copy the server's stdout and stderr, line by
    line, to ``sys.stdout`` and ``sys.stderr``.

    Args:
        args: parsed command line; reads ``host``, ``port``, ``hf_repo``, ``hf_file``,
            ``n_gpu_layers``, ``ctx_size``, ``parallel``, ``batch_size``, ``ubatch_size`` and
            ``max_tokens``.

    Returns:
        The ``subprocess.Popen`` handle; the function does not wait for the server to be ready.
    """
    # Start the server
    server_path = os.environ.get('LLAMA_SERVER_BIN_PATH', '../../../build/bin/llama-server')
    command = [str(part) for part in [
        server_path,
        '--host', args.host,
        '--port', args.port,
        '--hf-repo', args.hf_repo,
        '--hf-file', args.hf_file,
        '--n-gpu-layers', args.n_gpu_layers,
        '--ctx-size', args.ctx_size,
        '--parallel', args.parallel,
        '--batch-size', args.batch_size,
        '--ubatch-size', args.ubatch_size,
        '--n-predict', args.max_tokens * 2,
        '--defrag-thold', "0.1",
        '--cont-batching',
        '--metrics',
        '--flash-attn',
    ]]
    print(f"bench: starting server with: {' '.join(command)}")
    server_process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def server_log(in_stream, out_stream):
        # Copy the stream line by line until it is closed.
        for line in iter(in_stream.readline, b''):
            # Invalid bytes are replaced rather than raised on: an exception would end this
            # thread, the pipe would stop being drained and the server could block on output.
            print(line.decode('utf-8', errors='replace'), end='', file=out_stream)

    for in_stream, out_stream in ((server_process.stdout, sys.stdout),
                                  (server_process.stderr, sys.stderr)):
        threading.Thread(target=server_log, args=(in_stream, out_stream)).start()

    return server_process
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
    """Tell whether a TCP connection to ``server_fqdn:server_port`` can be established.

    A message is printed when the connection succeeds.

    Returns:
        ``True`` if the connection attempt succeeded, ``False`` otherwise.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with closing(probe):
        # connect_ex returns an error code instead of raising; 0 means connected.
        listening = probe.connect_ex((server_fqdn, server_port)) == 0
        if listening:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return listening
|
||||
|
||||
|
||||
def is_server_ready(server_fqdn, server_port):
    """Tell whether the server's ``/health`` endpoint answers with HTTP 200.

    Returns:
        ``True`` on a 200 response; ``False`` on any other status, on a connection error or
        when no answer arrives within 5 seconds.
    """
    url = f"http://{server_fqdn}:{server_port}/health"
    try:
        # A timeout keeps one stuck request from blocking the caller's polling loop forever.
        response = requests.get(url, timeout=5)
    except requests.RequestException:
        # Connection refused/reset or timeout while the server is still starting: not ready yet.
        return False
    return response.status_code == 200
|
||||
|
||||
|
||||
def escape_metric_name(metric_name):
    """Upper-case ``metric_name`` and replace every character outside ``A-Z0-9`` with ``_``."""
    uppercased = metric_name.upper()
    return re.compile('[^A-Z0-9]').sub('_', uppercased)
|
||||
|
||||
|
||||
# Run the benchmark only when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|
9
tools/server/bench/prometheus.yml
Normal file
9
tools/server/bench/prometheus.yml
Normal file
|
@ -0,0 +1,9 @@
|
|||
# Prometheus configuration used together with the server benchmark.
global:
  # Interval between two scrapes of each target.
  scrape_interval: 10s
  external_labels:
    llamacpp: 'server'

scrape_configs:
  # The benchmarked server; bench.py starts it with --metrics and, by default, on port 8080.
  - job_name: 'llama.cpp server'
    static_configs:
      - targets: ['localhost:8080']
|
2
tools/server/bench/requirements.txt
Normal file
2
tools/server/bench/requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
matplotlib
|
||||
requests
|
162
tools/server/bench/script.js
Normal file
162
tools/server/bench/script.js
Normal file
|
@ -0,0 +1,162 @@
|
|||
import sse from 'k6/x/sse'
|
||||
import {check, sleep} from 'k6'
|
||||
import {SharedArray} from 'k6/data'
|
||||
import {Counter, Rate, Trend} from 'k6/metrics'
|
||||
import exec from 'k6/execution';
|
||||
|
||||
/**
 * Reads an integer setting from the k6 environment.
 *
 * @param name - name of the environment variable
 * @param fallback - value used when the variable is unset or empty
 * @returns the parsed integer, or `fallback`
 * @throws Error when the variable is set but does not start with a decimal number; left
 *         unchecked, the resulting NaN would only surface later as an empty dataset.
 */
function envInt(name, fallback) {
    const raw = __ENV[name]
    if (!raw) {
        return fallback
    }
    const parsed = parseInt(raw, 10)
    if (Number.isNaN(parsed)) {
        throw new Error(`${name} must be an integer, got "${raw}"`)
    }
    return parsed
}

// Server chat completions prefix
const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'

// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
const n_prompt = envInt('SERVER_BENCH_N_PROMPTS', 600 / 10 * 8)

// Model name to request
const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'

// Dataset path
const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

// Max tokens to predict
const max_tokens = envInt('SERVER_BENCH_MAX_TOKENS', 512)

// Max prompt tokens
const n_prompt_tokens = envInt('SERVER_BENCH_MAX_PROMPT_TOKENS', 1024)

// Max slot context
const n_ctx_slot = envInt('SERVER_BENCH_MAX_CONTEXT', 2048)
|
||||
|
||||
/** k6 setup hook: logs the effective benchmark settings. */
export function setup() {
    const settings = [
        `server_url=${server_url}`,
        `n_prompt=${n_prompt}`,
        `model=${model}`,
        `dataset_path=${dataset_path}`,
        `max_tokens=${max_tokens}`,
    ]
    console.info('Benchmark config: ' + settings.join(' '))
}
|
||||
|
||||
/**
 * Prompts replayed by the benchmark, loaded from the dataset file.
 *
 * A conversation is kept when it has at least two turns, starts with a "human" turn, and its
 * first two turns are neither too short (fewer than 4 estimated tokens) nor too long
 * (prompt above n_prompt_tokens, or prompt + completion above n_ctx_slot).
 * Only the first n_prompt matching conversations are kept.
 */
const data = new SharedArray('conversations', function () {
    // Rough token estimate: number of pieces after splitting on whitespace and basic punctuation.
    const countTokens = (text) => text.split(/[\s,'".?]/).length

    const selected = []
    for (const entry of JSON.parse(open(dataset_path))) {
        const turns = entry["conversations"]
        if (!(turns.length >= 2)) {
            continue
        }
        if (!(turns[0]["from"] === "human")) {
            continue
        }

        const conv = {
            prompt: turns[0]["value"],
            n_prompt_tokens: countTokens(turns[0]["value"]),
            n_completion_tokens: countTokens(turns[1]["value"]),
        }

        // Too short sequences
        if (!(conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)) {
            continue
        }
        // Too long sequences
        if (!(conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)) {
            continue
        }
        selected.push(conv)
    }

    // Keep only first n prompts
    return selected.slice(0, n_prompt)
})
|
||||
|
||||
// Custom k6 metrics fed by the default function below.

// Per-request token counts taken from the `usage` object of the streamed response.
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

// Completion tokens per second, measured from the first streamed event to the end of the request.
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
// Prompt tokens per second, measured from the start of the request to the first streamed event.
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
// Seconds between the start of the request and the first streamed event.
const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')

// Running totals of the token counts over the whole test.
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

// Share of requests whose finish_reason was 'length' (truncated) or 'stop'.
const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
|
||||
|
||||
// k6 options: an abort threshold on truncated completions plus the default duration and number
// of virtual users (the README and bench.py pass --duration and --vus on the command line).
export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // the test is aborted when 80% or more of the completions are truncated;
            // the threshold is not evaluated for aborting during the first minute
            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
    duration: '10m',
    vus: 8,
}
|
||||
|
||||
/**
 * One benchmark iteration: streams a chat completion for one dataset prompt and records the
 * token counts, throughput and finish-reason metrics, then pauses for 0.3 s.
 */
export default function () {
    // Prompts are picked in order, wrapping around when iterations outnumber prompts.
    const conversation = data[exec.scenario.iterationInInstance % data.length]

    const requestBody = JSON.stringify({
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant.",
            },
            {
                "role": "user",
                "content": conversation.prompt,
            }
        ],
        "model": model,
        "stream": true,
        "stream_options": {
            "include_usage": true, // False to be supported in llama.cpp server
        },
        "seed": 42,
        "max_tokens": max_tokens,
        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
    })
    const params = {method: 'POST', body: requestBody};

    // State filled in by the event handler while the response streams in.
    let firstEventTime = null
    let promptTokenCount = 0
    let completionTokenCount = 0
    let finishReason = null

    const startTime = new Date()

    const onEvent = function (event) {
        // The first event of any kind marks the end of prompt processing.
        if (firstEventTime == null) {
            firstEventTime = new Date()
            llamacpp_emit_first_token_second.add((firstEventTime - startTime) / 1.e3)
        }

        // The end-of-stream marker and empty events carry no JSON payload.
        if (event.data === '[DONE]' || event.data === '') {
            return
        }

        const chunk = JSON.parse(event.data)

        const choices = chunk.choices
        if (choices && choices.length > 0 && choices[0].finish_reason) {
            finishReason = choices[0].finish_reason
        }

        if (chunk.usage) {
            promptTokenCount = chunk.usage.prompt_tokens
            llamacpp_prompt_tokens.add(promptTokenCount)
            llamacpp_prompt_tokens_total_counter.add(promptTokenCount)

            completionTokenCount = chunk.usage.completion_tokens
            llamacpp_completion_tokens.add(completionTokenCount)
            llamacpp_completion_tokens_total_counter.add(completionTokenCount)
        }
    }

    const onError = function (e) {
        console.log('An unexpected error occurred: ', e.error());
        throw e;
    }

    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
        client.on('event', onEvent)
        client.on('error', onError)
    })

    check(res, {'success completion': (r) => r.status === 200})

    const endTime = new Date()

    // Prompt throughput: prompt tokens over the time to the first event (milliseconds -> seconds).
    const promptEvalMs = firstEventTime - startTime
    if (promptEvalMs > 0) {
        llamacpp_prompt_processing_second.add(promptTokenCount / promptEvalMs * 1.e3)
    }

    // Generation throughput: completion tokens over the time elapsed after the first event.
    const generationMs = endTime - firstEventTime
    if (completionTokenCount > 0 && generationMs > 0) {
        llamacpp_tokens_second.add(completionTokenCount / generationMs * 1.e3)
    }

    llamacpp_completions_truncated_rate.add(finishReason === 'length')
    llamacpp_completions_stop_rate.add(finishReason === 'stop')

    sleep(0.3)
}
|
Loading…
Add table
Add a link
Reference in a new issue