test_quant_model.py
# Adapted from
# https://github.com/fmmoret/aphrodite/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
from dataclasses import dataclass
from typing import List

import pytest

import aphrodite
from aphrodite.lora.request import LoRARequest

from .conftest import cleanup


@dataclass
class ModelWithQuantization:
    model_path: str
    quantization: str


MODELS: List[ModelWithQuantization] = [
    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
                          quantization="AWQ"),
    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
                          quantization="GPTQ"),
]
def do_sample(llm: aphrodite.LLM,
              lora_path: str,
              lora_id: int,
              max_tokens: int = 256) -> List[str]:
    """Generate completions for the fixed color prompts, optionally through
    a LoRA adapter (lora_id=0 disables LoRA), and return the generated texts."""
    raw_prompts = [
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
    ]

    def format_prompt_tuples(prompt):
        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    prompts = [format_prompt_tuples(p) for p in raw_prompts]

    sampling_params = aphrodite.SamplingParams(temperature=0,
                                               max_tokens=max_tokens,
                                               stop=["<|im_end|>"])
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts: List[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < tp_size:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    llm = aphrodite.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
        tensor_parallel_size=tp_size,
        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True)

    if model.quantization is None:
        expected_no_lora_output = [
            "Here are some examples of orange-brown colors",
            "I'm sorry, I don't have",
        ]
        expected_lora_output = [
            "#ff8050",
            "#ff8080",
        ]
    elif model.quantization == "AWQ":
        expected_no_lora_output = [
            "I'm sorry, I don't understand",
            "I'm sorry, I don't understand",
        ]
        expected_lora_output = [
            "#f07700: A v",
            "#f00000: A v",
        ]
    elif model.quantization == "GPTQ":
        expected_no_lora_output = [
            "I'm sorry, I don't have",
            "I'm sorry, I don't have",
        ]
        expected_lora_output = [
            "#f08800: This is",
            "#f07788 \n#",
        ]

    def expect_match(output, expected_output):
        # HACK: GPTQ lora outputs are just incredibly unstable.
        # Assert that the outputs changed.
        if (model.quantization == "GPTQ"
                and expected_output is expected_lora_output):
            assert output != expected_no_lora_output
            for i, o in enumerate(output):
                assert o.startswith(
                    '#'), f"Expected example {i} to start with # but got {o}"
            return
        assert output == expected_output

    max_tokens = 10

    print("lora adapter created")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=0,
                       max_tokens=max_tokens)
    expect_match(output, expected_no_lora_output)

    print("lora 1")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=1,
                       max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("no lora")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=0,
                       max_tokens=max_tokens)
    expect_match(output, expected_no_lora_output)

    print("lora 2")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=2,
                       max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("removing lora")
    del llm
    cleanup()
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.skip("Requires multiple GPUs")
def test_quant_model_tp_equality(tinyllama_lora_files, model):
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 2:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
    llm_tp1 = aphrodite.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True)
    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

    del llm_tp1
    cleanup()

    llm_tp2 = aphrodite.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=2,
        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization)
    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

    del llm_tp2
    cleanup()

    assert output_tp1 == output_tp2
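# A minimal way to run this module locally, assuming the file sits at
# tests/lora/test_quant_model.py (the path is inferred from the adapted-from
# URL above, not confirmed here) and that the tinyllama_lora_files fixture is
# provided by the neighbouring conftest.py:
#
#   pytest tests/lora/test_quant_model.py -v -s
#
# -s keeps the print() output from do_sample visible, which helps when the
# quantized expected strings above drift between kernel versions.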