- """
- Tests gguf models against unquantized models generations
- Note: To pass the test, quantization higher than Q4 should be used
- """
- import os
- import pytest
- from huggingface_hub import hf_hub_download
- from transformers import AutoTokenizer
- from tests.quantization.utils import is_quant_method_supported
- from ...utils import check_logprobs_close
- os.environ["TOKENIZERS_PARALLELISM"] = "true"
- MAX_MODEL_LEN = 1024
# FIXME: Move this to conftest
MODELS = [
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
                     filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
    ("Qwen/Qwen2-1.5B-Instruct",
     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
    ("Qwen/Qwen2-1.5B-Instruct",
     hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
                     filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
]


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
    num_gpus_available,
    aphrodite_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
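    # Skip when the host does not expose enough GPUs for the requested
    # tensor-parallel size.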
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    original_model, gguf_model = model

    tokenizer = AutoTokenizer.from_pretrained(original_model)
    messages = [[{
        'role': 'user',
        'content': prompt
    }] for prompt in example_prompts]
    example_prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)

    # Run unquantized model.
    with aphrodite_runner(model_name=original_model,
                          dtype=dtype,
                          max_model_len=MAX_MODEL_LEN,
                          tensor_parallel_size=tp_size) as original_model:
        original_outputs = original_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    # Run gguf model.
    with aphrodite_runner(model_name=gguf_model,
                          dtype=dtype,
                          max_model_len=MAX_MODEL_LEN,
                          tensor_parallel_size=tp_size) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
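
    # Verify that the GGUF model's generations stay close to the
    # unquantized model's outputs.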
    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )