import openai  # use the official client for correctness check
import pytest
# downloading lora to test lora requests
from huggingface_hub import snapshot_download

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"


@pytest.fixture(scope="module")
def zephyr_lora_files():
    return snapshot_download(repo_id=LORA_NAME)


@pytest.fixture(scope="module")
def server(zephyr_lora_files):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def client(server):
    return server.get_async_client()


@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    models = await client.models.list()
    models = models.data
    served_model = models[0]
    lora_models = models[1:]
    assert served_model.id == MODEL_NAME
    assert all(model.root == MODEL_NAME for model in models)
    assert lora_models[0].id == "zephyr-lora"
    assert lora_models[1].id == "zephyr-lora2"