# test_gguf.py
  1. """
  2. Tests gguf models against unquantized models generations
  3. Note: To pass the test, quantization higher than Q4 should be used
  4. """
  5. import os
  6. import pytest
  7. from huggingface_hub import hf_hub_download
  8. from tests.quantization.utils import is_quant_method_supported
  9. from .utils import check_logprobs_close
  10. os.environ["TOKENIZERS_PARALLELISM"] = "true"
  11. MAX_MODEL_LEN = 1024
  12. # FIXME: Move this to confest
  13. MODELS = [
  14. ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  15. hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
  16. filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf")),
  17. ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  18. hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
  19. filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
  20. ("Qwen/Qwen2-1.5B-Instruct",
  21. hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
  22. filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
  23. ("Qwen/Qwen2-1.5B-Instruct",
  24. hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
  25. filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
  26. ]
  27. @pytest.mark.skipif(not is_quant_method_supported("gguf"),
  28. reason="gguf is not supported on this GPU type.")
  29. @pytest.mark.parametrize("model", MODELS)
  30. @pytest.mark.parametrize("dtype", ["half"])
  31. @pytest.mark.parametrize("max_tokens", [32])
  32. @pytest.mark.parametrize("num_logprobs", [5])
  33. def test_models(
  34. aphrodite_runner,
  35. example_prompts,
  36. model,
  37. dtype: str,
  38. max_tokens: int,
  39. num_logprobs: int,
  40. ) -> None:
  41. original_model, gguf_model = model
  42. # Run unquantized model.
  43. with aphrodite_runner(model_name=original_model,
  44. dtype=dtype,
  45. max_model_len=MAX_MODEL_LEN,
  46. enforce_eager=True,
  47. tensor_parallel_size=1) as original_model:
  48. original_outputs = original_model.generate_greedy_logprobs(
  49. example_prompts[:-1], max_tokens, num_logprobs)
  50. # Run gguf model.
  51. with aphrodite_runner(model_name=gguf_model,
  52. dtype=dtype,
  53. max_model_len=MAX_MODEL_LEN,
  54. enforce_eager=True,
  55. tensor_parallel_size=1) as gguf_model:
  56. gguf_outputs = gguf_model.generate_greedy_logprobs(
  57. example_prompts[:-1], max_tokens, num_logprobs)
  58. check_logprobs_close(
  59. outputs_0_lst=original_outputs,
  60. outputs_1_lst=gguf_outputs,
  61. name_0="original",
  62. name_1="gguf",
  63. )