test_long_context.py

import ast
from typing import List, Optional, Tuple

import numpy as np
import pytest

import aphrodite
from aphrodite import SamplingParams
from aphrodite.lora.layers import LinearScalingRotaryEmbeddingWithLora
from aphrodite.lora.request import LoRARequest
from aphrodite.modeling.layers.rotary_embedding import (
    LinearScalingRotaryEmbedding)

from .data.long_context_test_data import prompts_and_responses

context_len_to_scaling_factor = {
    "16k": 4,
    "32k": 8,
}

# We use the same sampling params for all requests
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=100,
)


def _create_lora_request(lora_id, long_context_infos):
    context_len = long_context_infos[lora_id]["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
    return LoRARequest(context_len, lora_id,
                       long_context_infos[lora_id]["lora"], None,
                       4096 * scaling_factor)


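# For illustration only: a sketch of the `long_context_infos` fixture shape
# this helper assumes (the real fixture comes from `conftest`; the ids and
# paths below are made up). Each entry maps a LoRA id to its context length
# and checkpoint path, so a "16k" LoRA gets a 4 * 4096 = 16384 token limit:
#
#   long_context_infos = {
#       1: {"context_length": "16k", "lora": "/path/to/16k-lora"},
#       2: {"context_length": "32k", "lora": "/path/to/32k-lora"},
#   }
#   _create_lora_request(1, long_context_infos)  # -> LoRARequest(..., 16384)

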
def evaluate_json_response(model_response, golden_response):
    """Evaluates the model response against the golden response.

    Returns a score between 0 and 1, where 1 is a perfect match and 0 is no
    match. The score quantifies how well the model is able to extract the
    golden JSON from the long context.
    """
    try:
        model_response = ast.literal_eval(model_response)
    except Exception as e:
        raise ValueError(
            f"Model response is not a valid JSON. Expected {golden_response}, "
            f"got {model_response}") from e

    # Normally, we would flatten the dictionary and compare the values, but in
    # this case, we know that the dictionary is only 2 levels deep
    positive_values = 0
    total_values = 0
    # We look at all the attributes of the person whose biography we are
    # extracting and compare them to the golden response
    for person_attribute, person_attribute_value in golden_response.items():
        if person_attribute in model_response:
            if isinstance(person_attribute_value, dict):
                for (sub_attribute,
                     sub_attribute_value) in person_attribute_value.items():
                    total_values += 1
                    if sub_attribute in model_response[
                            person_attribute] and model_response[
                                person_attribute][
                                    sub_attribute] == sub_attribute_value:
                        positive_values += 1
            else:
                total_values += 1
                if model_response[person_attribute] == person_attribute_value:
                    positive_values += 1
        else:
            # We count a missing sub-dict as a single missed value.
            total_values += 1

    # Return a score between 0 and 1
    return positive_values / total_values


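# A quick worked example (hypothetical data, not part of the test set): with
# golden_response = {"name": "Ada", "work": {"field": "math", "era": "1800s"}}
# and a model response that gets "name" and "field" right but "era" wrong,
# total_values is 3, positive_values is 2, and the score is 2 / 3.

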
def generate(
    llm: aphrodite.LLM,
    inputs: Tuple[str, SamplingParams, Optional[LoRARequest]],
):
    prompts, sampling_param, lora_request = inputs
    outputs = llm.generate(prompts, sampling_param, lora_request=lora_request)
    return outputs[0].outputs[0].text.strip()


def batched_generate(
    llm: aphrodite.LLM,
    inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    for input in inputs:
        prompt, sampling_param, lora_req = input
        # Add requests to the engine and run the engine
        llm._validate_and_add_requests(prompt,
                                       sampling_param,
                                       lora_request=lora_req,
                                       prompt_adapter_request=None)

    outputs = llm._run_engine(use_tqdm=True)
    return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]


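# Usage sketch (illustrative only; `infos` stands in for the real
# `long_context_infos` fixture):
#
#   batch = [
#       (prompts_and_responses["16k"][0]["prompt"], sampling_params,
#        _create_lora_request(1, infos)),
#       (prompts_and_responses["32k"][0]["prompt"], sampling_params,
#        _create_lora_request(2, infos)),
#   ]
#   texts = batched_generate(llm, batch)  # one output string per input tuple,
#                                         # in submission order

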
@pytest.fixture(scope="module")
def lora_llm(long_context_infos):
    scaling_factors = [
        context_len_to_scaling_factor[info["context_length"]]
        for info in long_context_infos.values()
    ]

    llm = aphrodite.LLM("meta-llama/Llama-2-13b-chat-hf",
                        enable_lora=True,
                        max_num_seqs=16,
                        max_loras=2,
                        long_lora_scaling_factors=tuple(scaling_factors),
                        max_num_batched_tokens=4096 * 8,
                        tensor_parallel_size=4,
                        distributed_executor_backend="mp")
    yield llm
    del llm


def test_rotary_emb_replaced(dist_init):
    """Verify the rotary embeddings in all layers are replaced."""
    from aphrodite.engine.args_tools import EngineArgs
    from aphrodite.worker.model_runner import ModelRunner
    engine_args = EngineArgs("meta-llama/Llama-2-7b-hf",
                             long_lora_scaling_factors=(4.0, ),
                             enable_lora=True)
    engine_config = engine_args.create_engine_config()
    model_runner = ModelRunner(
        model_config=engine_config.model_config,
        parallel_config=engine_config.parallel_config,
        scheduler_config=engine_config.scheduler_config,
        device_config=engine_config.device_config,
        cache_config=engine_config.cache_config,
        load_config=engine_config.load_config,
        lora_config=engine_config.lora_config,
        is_driver_worker=True,
    )
    model_runner.load_model()
    rotary_emb_count = 0
    for module_name, module in model_runner.model.named_modules(
            remove_duplicate=False):
        if "rotary_emb" in module_name:
            if "base_layer" not in module_name:
                rotary_emb_count += 1
                assert isinstance(module, LinearScalingRotaryEmbeddingWithLora)
            else:
                assert isinstance(module, LinearScalingRotaryEmbedding)
    # Llama 2 has 32 layers.
    assert rotary_emb_count == 32


@pytest.mark.skip_global_cleanup
def test_batched_rope_kernel(lora_llm, long_context_infos):
    """We test the batched kernel by comparing the results of batched and
    non-batched generation.
    """
    # Create non-batched results first to compare against batched results
    non_batched_results: List[str] = []

    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        lora_prompt = (prompts_and_responses[context_len][0]["prompt"],
                       sampling_params,
                       _create_lora_request(lora_id, long_context_infos))
        lora_output = generate(lora_llm, lora_prompt)
        non_batched_results.append(lora_output)

    # Create batched results
    # Each element of the batch must be
    # (prompt, prompt_sampling_params, prompt_lora_request)
    batched_prompts: List[Tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        batched_prompts.extend([
            (prompts_and_responses[context_len][0]["prompt"], sampling_params,
             _create_lora_request(lora_id, long_context_infos))
        ])
    batched_results = batched_generate(lora_llm, batched_prompts)

    # Results should be the same
    for non_batched, batched in zip(non_batched_results, batched_results):
        assert non_batched == batched, (
            "Non-batched and batched results should be the "
            f"same:\n{batched}\n{non_batched}")


@pytest.mark.skip_global_cleanup
def test_self_consistency(lora_llm, long_context_infos):
    """We test consistency of the batched kernel by permuting batched
    inputs and comparing the results to the non-permuted batched results.
    """
    num_loras = len(long_context_infos)

    # Create results in order of long_context_infos
    batched_prompts: List[Tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        batched_prompts.extend([
            (prompts_and_responses[context_len][0]["prompt"], sampling_params,
             _create_lora_request(lora_id, long_context_infos))
        ])
    batched_results = batched_generate(lora_llm, batched_prompts)

    permutation = np.random.default_rng(seed=42).permutation(num_loras)

    # Create results in random order of permutation
    batched_prompts = []
    for i in permutation:
        lora_id, info = list(long_context_infos.items())[i]
        context_len = info["context_length"]
        batched_prompts.extend([
            (prompts_and_responses[context_len][0]["prompt"], sampling_params,
             _create_lora_request(lora_id, long_context_infos))
        ])
    permutated_batched_results = batched_generate(lora_llm, batched_prompts)

    # Results should be the same
    for i in range(num_loras):
        assert batched_results[i] == permutated_batched_results[
            permutation[i]], (
                f"Results should be the same:\n{batched_results[i]}"
                f"\n{permutated_batched_results[permutation[i]]}")


@pytest.mark.skip_global_cleanup
def test_quality(lora_llm, long_context_infos):
    """We test the quality of the answers given by the LoRA model by
    comparing the generated text to the merged model's outputs.

    This is effectively a mini-benchmark over four prompts.
    If this test fails, this indicates that the quality of the LoRA model
    is suboptimal compared to the merged model. For example, if the model
    does not output valid dictionaries, this test will fail.

    If needed for testing, the merged versions of the models are available
    as part of the `conftest`.

    The test is expected to run for about 1 minute on a p4de.24xlarge
    instance.
    """
    scores: List[float] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        for prompt_and_response in prompts_and_responses[context_len]:
            lora_prompt = (prompt_and_response["prompt"], sampling_params,
                           _create_lora_request(lora_id, long_context_infos))
            response = generate(lora_llm, lora_prompt)
            golden_answer = prompt_and_response["golden_answer"]
            score = evaluate_json_response(response, golden_answer)
            scores.append(score)
            assert score > 0.3, ("Quality of the answer is not good enough. "
                                 f"Expected {golden_answer}, got {response}")
    assert np.mean(scores) > 0.5


@pytest.mark.skip_global_cleanup
def test_max_len(lora_llm, long_context_infos):
    """Test that we raise a ValueError when the input of a given LoRA
    model exceeds the maximum length."""
    # Since each LoRA model has a different maximum length, we need to
    # test each one separately
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        lora_request = _create_lora_request(lora_id, long_context_infos)
        # Good prompt should be fine
        good_prompt = prompts_and_responses[context_len][0]["prompt"]
        generate(lora_llm, (good_prompt, sampling_params, lora_request))
        # Bad prompt should raise an error
        bad_prompt = good_prompt * 2
        with pytest.raises(ValueError):
            generate(lora_llm, (bad_prompt, sampling_params, lora_request))

    # Also test batched
    batched_prompts: List[Tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id_with_bad_inputs in long_context_infos:
        for lora_id, info in long_context_infos.items():
            context_len = info["context_length"]
            batched_prompts.extend([
                (prompts_and_responses[context_len][0]["prompt"] *
                 (2 if lora_id == lora_id_with_bad_inputs else 1),
                 sampling_params,
                 _create_lora_request(lora_id, long_context_infos))
            ])
    # Turn good prompt into bad prompt inside of batched prompts
    with pytest.raises(ValueError):
        batched_generate(lora_llm, batched_prompts)