import openai  # use the official client for correctness check
import pytest
import requests

from aphrodite.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
from .test_completion import zephyr_lora_files  # noqa: F401

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
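

# `server` boots a live OpenAI-compatible API server once per module, with a
# LoRA adapter (whose tokenizer carries added tokens) registered under the
# name "zephyr-lora2", so the endpoints below can be exercised against both
# the base model and the adapter.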
@pytest.fixture(scope="module")
def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # lora config
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
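

# Map the served model name back to a local tokenizer path: the LoRA
# adapter's added tokens only exist in the files on disk, not in the base
# model's tokenizer. The tests route their "tokenizer_name" parameter
# through this fixture via `indirect` parametrization.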
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str,
                   zephyr_lora_added_tokens_files: str):  # noqa: F811
    return zephyr_lora_added_tokens_files if (
        model_name == "zephyr-lora2") else model_name


@pytest.fixture(scope="module")
def client(server):
    return server.get_async_client()
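

# The tests below issue raw HTTP requests with `requests` instead of using
# the openai client: /v1/tokenize and /v1/detokenize are server extensions
# that the official client does not wrap.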
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
                                    model_name: str, tokenizer_name: str):
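    # client.base_url ends with "/v1/"; strip the version suffix and trailing
    # slash so the full route can be appended below.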
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    for add_special in [False, True]:
        prompt = "aphrodite1 This is a test prompt."
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        response = requests.post(base_url + "/v1/tokenize",
                                 json={
                                     "add_special_tokens": add_special,
                                     "model": model_name,
                                     "prompt": prompt
                                 })
        response.raise_for_status()

        assert response.json() == {
            "tokens": tokens,
            "count": len(tokens),
            "max_model_len": 8192
        }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                             tokenizer_name: str):
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    for add_generation in [False, True]:
        for add_special in [False, True]:
            conversation = [{
                "role": "user",
                "content": "Hi there!"
            }, {
                "role": "assistant",
                "content": "Nice to meet you!"
            }, {
                "role": "user",
                "content": "Can I ask a question? aphrodite1"
            }]
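
            # Build the reference token ids the same way the server is
            # expected to: render the chat template to a string, then encode.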
            prompt = tokenizer.apply_chat_template(
                add_generation_prompt=add_generation,
                conversation=conversation,
                tokenize=False)
            tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

            response = requests.post(base_url + "/v1/tokenize",
                                     json={
                                         "add_generation_prompt":
                                         add_generation,
                                         "add_special_tokens": add_special,
                                         "messages": conversation,
                                         "model": model_name
                                     })
            response.raise_for_status()

            assert response.json() == {
                "tokens": tokens,
                "count": len(tokens),
                "max_model_len": 8192
            }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
                          tokenizer_name: str):
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    prompt = "This is a test prompt. aphrodite1"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)
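
    # Detokenizing should round-trip exactly here, since no special tokens
    # were added at encode time.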
- print(f"CALLING {base_url} FOR {model_name}")
- response = requests.post(base_url + "/v1/detokenize",
- json={
- "model": model_name,
- "tokens": tokens
- })
- response.raise_for_status()
- assert response.json() == {"prompt": prompt}