  1. """
  2. Tests gguf models against unquantized models generations
  3. Note: To pass the test, quantization higher than Q4 should be used
  4. """
import os

import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

from ...utils import check_logprobs_close
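
# Keep HF tokenizers parallelism enabled so forked worker processes do not
# disable it (and emit the fork warning).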
os.environ["TOKENIZERS_PARALLELISM"] = "true"

MAX_MODEL_LEN = 1024

# FIXME: Move this to conftest
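# Each entry pairs an original HF model ID with the local path of its
# downloaded GGUF checkpoint.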
MODELS = [
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
                     filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
    ("Qwen/Qwen2-1.5B-Instruct",
     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
    ("Qwen/Qwen2-1.5B-Instruct",
     hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
                     filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
]


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
    num_gpus_available,
    aphrodite_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    original_model, gguf_model = model

    tokenizer = AutoTokenizer.from_pretrained(original_model)
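    # Wrap each raw prompt in a single-turn chat conversation and render it
    # with the model's chat template.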
    messages = [[{
        'role': 'user',
        'content': prompt
    }] for prompt in example_prompts]
    example_prompts = tokenizer.apply_chat_template(messages,
                                                    tokenize=False,
                                                    add_generation_prompt=True)

    # Run unquantized model.
    with aphrodite_runner(model_name=original_model,
                          dtype=dtype,
                          max_model_len=MAX_MODEL_LEN,
                          tensor_parallel_size=tp_size) as original_model:
        original_outputs = original_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    # Run gguf model.
    with aphrodite_runner(model_name=gguf_model,
                          dtype=dtype,
                          max_model_len=MAX_MODEL_LEN,
                          tensor_parallel_size=tp_size) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
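
    # The GGUF generations should closely match the unquantized baseline at
    # the token/logprob level.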
    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )