- """
- This file test accuracy of the Aphrodite server via LMEval.
- It uses local-completions, which interacts with Aphrodite
- through the OAI API with N concurrent connections.
- This simulates real work usage of the API and makes
- sure that the zmq frontend mp RPC message passing and
- AsyncAPhroditeEngine are working correctly.
- """

import lm_eval
import pytest

from ...utils import RemoteOpenAIServer
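
# NOTE: lm_eval is EleutherAI's lm-evaluation-harness ("pip install lm-eval");
# its "local-completions" model type drives any OpenAI-compatible
# /v1/completions endpoint, which is exactly what this test exercises.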

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# Accept measured scores within +/- RTOL of the expected GSM8K value.
RTOL = 0.03
EXPECTED_VALUE = 0.58
DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
MORE_ARGS_LIST = [
    ["--enable-chunked-prefill"],  # Chunked
    ["--num-scheduler-steps", "8"],  # MS
    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"],  # MS+Stream
]
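
# For reference, each case above is roughly what you would get by launching
# the server by hand with DEFAULT_ARGS plus one variant (the module path is
# an assumption; in this test RemoteOpenAIServer starts and stops the
# process itself):
#   python -m aphrodite.endpoints.openai.api_server \
#       --model Qwen/Qwen2-1.5B-Instruct \
#       --max-model-len 4096 --disable-log-requests \
#       --num-scheduler-steps 8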


@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy(more_args):
    args = list(DEFAULT_ARGS)
    args.extend(more_args)

    print(f"Running with: {args}")

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"

        model_args = (
            f"model={MODEL_NAME},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=[TASK],
        )

        measured_value = results["results"][TASK][FILTER]
        assert (measured_value - RTOL < EXPECTED_VALUE
                and measured_value + RTOL > EXPECTED_VALUE
                ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"