- """Benchmark online serving throughput.
- On the server side, run one of the following commands:
- Aphrodite OpenAI API server
- aphrodite run <your_model> \
- --swap-space 16 \
- --disable-log-requests
- (TGI backend)
- ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
- On the client side, run:
- python tests/benchmarks/serving.py \
- --backend <backend> \
- --model <your_model> \
- --dataset-name sharegpt \
- --dataset-path <path to dataset> \
- --request-rate <request_rate> \ # By default <request_rate> is inf
- --num-prompts <num_prompts> # By default <num_prompts> is 1000
- when using tgi backend, add
- --endpoint /generate_stream
- to the end of the command above.
- """
import argparse
import asyncio
import json
import os
import random
import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase

try:
    from aphrodite.transformers_utils.tokenizer import get_tokenizer
except ImportError:
    from backend_request_func import get_tokenizer

try:
    from aphrodite.common.utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
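

# Abbreviations used throughout (matching the report headers below):
#   TTFT: time to first token
#   TPOT: time per output token, excluding the first token
#   ITL:  inter-token latency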
@dataclass
class BenchmarkMetrics:
    completed: int
    total_input: int
    total_output: int
    request_throughput: float
    input_throughput: float
    output_throughput: float
    mean_ttft_ms: float
    median_ttft_ms: float
    std_ttft_ms: float
    p99_ttft_ms: float
    mean_tpot_ms: float
    median_tpot_ms: float
    std_tpot_ms: float
    p99_tpot_ms: float
    mean_itl_ms: float
    median_itl_ms: float
    std_itl_ms: float
    p99_itl_ms: float


def sample_sharegpt_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
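    """Sample prompts and target output lengths from a ShareGPT-format
    JSON dataset, pruning sequences that are too short or too long."""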
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]

    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short.
    filtered_dataset: List[Tuple[str, int, int]] = []
    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break

        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = (len(completion_token_ids)
                      if fixed_output_len is None else fixed_output_len)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    return filtered_dataset


def sample_sonnet_requests(
    dataset_path: str,
    num_requests: int,
    input_len: int,
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int]]:
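    """Build prompts from randomly sampled poem lines; the first roughly
    `prefix_len` tokens of every prompt are the same fixed lines."""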
    assert (
        input_len > prefix_len
    ), ("'args.sonnet-input-len' must be greater than "
        "'args.sonnet-prefix-len'.")

    # Load the dataset.
    with open(dataset_path) as f:
        poem_lines = f.readlines()

    # Tokenize the poem lines.
    poem_token_ids = tokenizer(poem_lines).input_ids
    average_poem_len = sum(
        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)

    # Base prefix for all requests.
    base_prompt = "Pick as many lines as you can from these poem lines:\n"
    base_message = [{
        "role": "user",
        "content": base_prompt,
    }]
    base_prompt_formatted = tokenizer.apply_chat_template(
        base_message, add_generation_prompt=True, tokenize=False)
    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)

    assert (
        input_len > base_prompt_offset
    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
    num_input_lines = round(
        (input_len - base_prompt_offset) / average_poem_len)

    # The first approximately `prefix_len` tokens of the prompt are fixed
    # poem lines shared by all requests.
    assert (
        prefix_len > base_prompt_offset
    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
    num_prefix_lines = round(
        (prefix_len - base_prompt_offset) / average_poem_len)
    prefix_lines = poem_lines[:num_prefix_lines]

    # Sample the rest of the lines per request.
    sampled_requests: List[Tuple[str, str, int, int]] = []
    for _ in range(num_requests):
        sampled_lines = "".join(
            prefix_lines +
            random.sample(poem_lines, num_input_lines - num_prefix_lines))

        prompt = f"{base_prompt}{sampled_lines}"
        message = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
        prompt_formatted = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, tokenize=False)
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
            (prompt, prompt_formatted, prompt_len, output_len))

    return sampled_requests


def sample_random_requests(
        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
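    """Generate fully synthetic requests, drawing per-request input and
    output lengths uniformly from [len * range_ratio, len]."""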
    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
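
    # Each prompt is a run of consecutive token ids starting at a
    # per-request random offset, so prompts differ from one another while
    # matching the sampled input length.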
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
    input_requests = []
    for i in range(num_prompts):
        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        input_requests.append(
            (prompt, int(input_lens[i]), int(output_lens[i])))

    return input_requests


async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
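    """Yield requests one at a time, sleeping between them so that
    arrivals follow a Poisson process at `request_rate` requests/s."""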
    input_requests = iter(input_requests)
    for request in input_requests:
        yield request

        if request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue

        # Sample the request interval from the exponential distribution.
        interval = np.random.exponential(1.0 / request_rate)
        # The next request will be sent after the interval.
        await asyncio.sleep(interval)


def calculate_metrics(
    input_requests: List[Tuple[str, int, int]],
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]:
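    """Aggregate per-request outputs into throughput and latency metrics."""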
    actual_output_lens: List[int] = []
    total_input = 0
    completed = 0
    itls: List[float] = []
    tpots: List[float] = []
    ttfts: List[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            # We use the tokenizer to count the number of output tokens
            # for all serving backends, instead of looking at
            # len(outputs[i].itl), since multiple output tokens may be
            # bundled together.
            # Note: this may inflate the output token count slightly.
            output_len = len(
                tokenizer(outputs[i].generated_text,
                          add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
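            # TPOT is the mean decode time per token after the first:
            # (total latency - TTFT) / (output_len - 1).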
            if output_len > 1:
                tpots.append(
                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            completed += 1
        else:
            actual_output_lens.append(0)

    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "of the benchmark arguments.",
            stacklevel=2)
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
        input_throughput=total_input / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        # ttfts is empty if streaming is not supported by the backend.
        mean_ttft_ms=np.mean(ttfts or 0) * 1000,
        median_ttft_ms=np.median(ttfts or 0) * 1000,
        std_ttft_ms=np.std(ttfts or 0) * 1000,
        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        std_tpot_ms=np.std(tpots or 0) * 1000,
        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
        mean_itl_ms=np.mean(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
        std_itl_ms=np.std(itls or 0) * 1000,
        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
    )

    return metrics, actual_output_lens


async def benchmark(
    backend: str,
    api_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    best_of: int,
    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
):
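    """Run the benchmark: dispatch requests according to the arrival
    process, gather responses, and print aggregate metrics."""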
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
    else:
        raise ValueError(f"Unknown backend: {backend}")

- print("Starting initial single prompt test run...")
- test_prompt, test_prompt_len, test_output_len = input_requests[0]
- test_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- best_of=best_of,
- use_beam_search=use_beam_search,
- )
- test_output = await request_func(request_func_input=test_input)
- if not test_output.success:
- raise ValueError(
- "Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}")
- else:
- print("Initial test run completed. Starting main benchmark run...")
- print(f"Traffic request rate: {request_rate}")
- pbar = None if disable_tqdm else tqdm(total=len(input_requests))
- benchmark_start_time = time.perf_counter()
- tasks: List[asyncio.Task] = []
- async for request in get_request(input_requests, request_rate):
- prompt, prompt_len, output_len = request
- request_func_input = RequestFuncInput(
- model=model_id,
- prompt=prompt,
- api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
- best_of=best_of,
- use_beam_search=use_beam_search,
- )
- tasks.append(
- asyncio.create_task(
- request_func(request_func_input=request_func_input,
- pbar=pbar)))
- outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
- if pbar is not None:
- pbar.close()
- benchmark_duration = time.perf_counter() - benchmark_start_time
- metrics, actual_output_lens = calculate_metrics(
- input_requests=input_requests,
- outputs=outputs,
- dur_s=benchmark_duration,
- tokenizer=tokenizer,
- )
- print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
- print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
- benchmark_duration))
- print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:",
- metrics.total_output))
- print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
- metrics.request_throughput))
- print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
- metrics.input_throughput))
- print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
- metrics.output_throughput))
- print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
- print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
- print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
- metrics.median_ttft_ms))
- print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
- print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
- n=50,
- c='-'))
- print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
- print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
- metrics.median_tpot_ms))
- print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
- print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
- print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
- print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
- print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
- print("=" * 50)

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        "input_throughput": metrics.input_throughput,
        "output_throughput": metrics.output_throughput,
        "mean_ttft_ms": metrics.mean_ttft_ms,
        "median_ttft_ms": metrics.median_ttft_ms,
        "std_ttft_ms": metrics.std_ttft_ms,
        "p99_ttft_ms": metrics.p99_ttft_ms,
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
        "std_tpot_ms": metrics.std_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms,
        "mean_itl_ms": metrics.mean_itl_ms,
        "median_itl_ms": metrics.median_itl_ms,
        "std_itl_ms": metrics.std_itl_ms,
        "p99_itl_ms": metrics.p99_itl_ms,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "generated_texts": [output.generated_text for output in outputs],
        "errors": [output.error for output in outputs],
    }
    return result


def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    backend = args.backend
    model_id = args.model
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"

    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)

    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next "
            "release. Please use '--dataset-name' and "
            "'--dataset-path' in future runs.",
            stacklevel=2)
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )
- elif args.dataset_name == "sharegpt":
- input_requests = sample_sharegpt_requests(
- dataset_path=args.dataset_path,
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- fixed_output_len=args.sharegpt_output_len,
- )
- elif args.dataset_name == "sonnet":
- # Do not format the prompt, pass to message directly
- if args.backend == "openai-chat":
- input_requests = sample_sonnet_requests(
- dataset_path=args.dataset_path,
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- )
- input_requests = [(prompt, prompt_len, output_len)
- for prompt, prompt_formatted, prompt_len,
- output_len in input_requests]
- else:
- assert (
- tokenizer.chat_template or tokenizer.default_chat_template
- ), "Tokenizer/model must have chat template for sonnet dataset."
- input_requests = sample_sonnet_requests(
- dataset_path=args.dataset_path,
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- )
- input_requests = [(prompt_formatted, prompt_len, output_len)
- for prompt, prompt_formatted, prompt_len,
- output_len in input_requests]
- elif args.dataset_name == "random":
- input_requests = sample_random_requests(
- input_len=args.random_input_len,
- output_len=args.random_output_len,
- num_prompts=args.num_prompts,
- range_ratio=args.random_range_ratio,
- tokenizer=tokenizer,
- )
- else:
- raise ValueError(f"Unknown dataset: {args.dataset_name}")

    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
            api_url=api_url,
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
            disable_tqdm=args.disable_tqdm,
        ))

    # Save config and results to json.
    if args.save_result:
        result_json: Dict[str, Any] = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts

        # Metadata
        if args.metadata:
            for item in args.metadata:
                if "=" in item:
                    # Split on the first '=' only, so values may contain '='.
                    key, value = item.split("=", 1)
                    result_json[key.strip()] = value.strip()
                else:
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE "
                        "format.")

        # Traffic
        result_json["request_rate"] = (
            args.request_rate if args.request_rate < float("inf") else "inf")

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
        file_name = (f"{backend}-{args.request_rate}qps-{base_model_id}-"
                     f"{current_dt}.json")
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
            json.dump(result_json, outfile)
- if __name__ == "__main__":
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput.")
- parser.add_argument(
- "--backend",
- type=str,
- default="aphrodite",
- choices=list(ASYNC_REQUEST_FUNCS.keys()),
- )
- parser.add_argument(
- "--base-url",
- type=str,
- default=None,
- help="Server or API base url if not using http host and port.",
- )
- parser.add_argument("--host", type=str, default="localhost")
- parser.add_argument("--port", type=int, default=8000)
- parser.add_argument(
- "--endpoint",
- type=str,
- default="/v1/completions",
- help="API endpoint.",
- )
- parser.add_argument(
- "--dataset",
- type=str,
- default=None,
- help="Path to the ShareGPT dataset, will be deprecated in the "
- "next release.",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- default="sharegpt",
- choices=["sharegpt", "sonnet", "random"],
- help="Name of the dataset to benchmark on.",
- )
- parser.add_argument("--dataset-path",
- type=str,
- default=None,
- help="Path to the dataset.")
- parser.add_argument(
- "--model",
- type=str,
- required=True,
- help="Name of the model.",
- )
- parser.add_argument(
- "--tokenizer",
- type=str,
- help=
- "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
- )
- parser.add_argument(
- "--best-of",
- type=int,
- default=1,
- help="Generates `best_of` sequences per prompt and "
- "returns the best one.",
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-prompts",
- type=int,
- default=1000,
- help="Number of prompts to process.",
- )
- parser.add_argument(
- "--sharegpt-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.")
- parser.add_argument(
- "--sonnet-input-len",
- type=int,
- default=550,
- help=
- "Number of input tokens per request, used only for sonnet dataset.",
- )
- parser.add_argument(
- "--sonnet-output-len",
- type=int,
- default=150,
- help=
- "Number of output tokens per request, used only for sonnet dataset.",
- )
- parser.add_argument(
- "--sonnet-prefix-len",
- type=int,
- default=200,
- help=
- "Number of prefix tokens per request, used only for sonnet dataset.",
- )
- parser.add_argument(
- "--random-input-len",
- type=int,
- default=1024,
- help=
- "Number of input tokens per request, used only for random sampling.",
- )
- parser.add_argument(
- "--random-output-len",
- type=int,
- default=128,
- help=
- "Number of output tokens per request, used only for random sampling.",
- )
- parser.add_argument(
- "--random-range-ratio",
- type=float,
- default=1.0,
- help="Range of sampled ratio of input/output length, "
- "used only for random sampling.",
- )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use a Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from Hugging Face.",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--save-result",
        action="store_true",
        help="Specify to save benchmark results to a json file.",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
        nargs="*",
        help="Key-value pairs (e.g., --metadata version=0.3.3 tp=1) "
        "for metadata of this run to be saved in the result JSON file "
        "for record keeping purposes.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=None,
        help="Specify directory to save benchmark json results. "
        "If not specified, results are saved in the current directory.",
    )
    parser.add_argument(
        "--result-filename",
        type=str,
        default=None,
        help="Specify the filename to save benchmark json results. "
        "If not specified, results will be saved in "
        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
        " format.",
    )

    args = parser.parse_args()
    main(args)