1
0

test_fp8.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. # flake8: noqa
"""Tests fp8 models against ground-truth generations.

Note: these tests will only pass on an L4 GPU.
"""
  5. import os
  6. from typing import List
  7. import pytest
  8. import torch
  9. from transformers import AutoTokenizer
  10. from aphrodite import LLM, SamplingParams
  11. from tests.quantization.utils import is_quant_method_supported
  12. os.environ["TOKENIZERS_PARALLELISM"] = "true"
  13. MAX_MODEL_LEN = 1024
  14. MODELS = [
  15. "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
  16. "meta-llama/Meta-Llama-3-8B-Instruct",
  17. ]
  18. EXPECTED_STRS_MAP = {
  19. "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": {
  20. "auto": [
  21. 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
  22. 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
  23. 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both',
  24. 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
  25. 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
  26. 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
  27. 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
  28. 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no'
  29. ],
  30. "fp8": [
  31. 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
  32. 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
  33. 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
  34. 'A neural network is a complex system made up of several basic components that work together to enable it to',
  35. 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like',
  36. 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
  37. 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
  38. 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk'
  39. ]
  40. },
  41. "meta-llama/Meta-Llama-3-8B-Instruct": {
  42. "auto": [
  43. 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
  44. 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
  45. 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
  46. 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
  47. 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
  48. 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
  49. 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
  50. 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
  51. ],
  52. "fp8": [
  53. 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
  54. 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
  55. 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
  56. 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
  57. 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest',
  58. 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
  59. 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
  60. 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu'
  61. ]
  62. },
  63. }
  64. # This test compares against golden strings for exact match since
  65. # there is no baseline implementation to compare against
  66. # and is unstable w.r.t specifics of the fp8 implementation or
  67. # the hardware being run on.
  68. # Disabled to prevent it from breaking the build
  69. @pytest.mark.skip(
  70. reason=
  71. "Prevent unstable test based on golden strings from breaking the build.")
  72. @pytest.mark.skipif(not is_quant_method_supported("fp8"),
  73. reason="fp8 is not supported on this GPU type.")
  74. @pytest.mark.parametrize("model_name", MODELS)
  75. @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
  76. def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
  77. model = LLM(model=model_name,
  78. max_model_len=MAX_MODEL_LEN,
  79. trust_remote_code=True,
  80. enforce_eager=True,
  81. quantization="fp8",
  82. kv_cache_dtype=kv_cache_dtype)
  83. tokenizer = AutoTokenizer.from_pretrained(model_name)
  84. formatted_prompts = [
  85. tokenizer.apply_chat_template([{
  86. "role": "user",
  87. "content": prompt
  88. }],
  89. tokenize=False,
  90. add_generation_prompt=True)
  91. for prompt in example_prompts
  92. ]
  93. params = SamplingParams(max_tokens=20, temperature=0)
  94. generations: List[str] = []
  95. # Note: these need to be run 1 at a time due to numerical precision,
  96. # since the expected strs were generated this way.
  97. for prompt in formatted_prompts:
  98. outputs = model.generate(prompt, params)
  99. generations.append(outputs[0].outputs[0].text)
  100. del model
  101. print(model_name, kv_cache_dtype, generations)
  102. expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
  103. for i in range(len(example_prompts)):
  104. generated_str = generations[i]
  105. expected_str = expected_strs[i]
  106. assert expected_str == generated_str, (
  107. f"Test{i}:\nExpected: {expected_str!r}\nAphrodite: {generated_str!r}")