  1. """Compare the outputs of HF and Aphrodite for BART models using greedy
  2. sampling.
  3. Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
  4. """
  5. from typing import List, Optional, Tuple, Type
  6. from aphrodite.common.utils import is_cpu
  7. if not is_cpu():
  8. # CPU backend is not currently supported with encoder/decoder models
  9. # skip test definitions entirely to avoid importing GPU kernel libs
  10. # (xFormers, etc.)
  11. import pytest
  12. from transformers import AutoModelForSeq2SeqLM
  13. from aphrodite.common.sequence import SampleLogprobs
  14. from ....conftest import (AphroditeRunner, DecoderPromptType,
  15. ExplicitEncoderDecoderPrompt, HfRunner)
  16. from ....utils import multi_gpu_test
  17. from ...utils import check_logprobs_close
  18. MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]

    def aphrodite_to_hf_output(
        aphrodite_output: Tuple[List[int], str, Optional[SampleLogprobs]],
        decoder_prompt_type: DecoderPromptType,
    ):
        """Sanitize Aphrodite output to be comparable with HF output."""
        output_ids, output_str, out_logprobs = aphrodite_output

        hf_output_str = output_str + "</s>"
        if decoder_prompt_type == DecoderPromptType.NONE:
            hf_output_str = "<s>" + hf_output_str

        return output_ids, hf_output_str, out_logprobs
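
    # Illustrative example (schematic values, not taken from a real run):
    # for a request with decoder_prompt_type == DecoderPromptType.NONE, an
    # Aphrodite output such as
    #     ([1023, 2], "The weather is nice", logprobs)
    # is sanitized to
    #     ([1023, 2], "<s>The weather is nice</s>", logprobs)
    # mirroring the special tokens that HF includes in its output string.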

    def run_test(
        hf_runner: Type[HfRunner],
        aphrodite_runner: Type[AphroditeRunner],
        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        decoder_prompt_type: DecoderPromptType,
        model: str,
        *,
        dtype: str,
        max_tokens: int,
        num_logprobs: int,
        tensor_parallel_size: int,
        distributed_executor_backend: Optional[str] = None,
    ) -> None:
        '''
        Test the Aphrodite BART model for a variety of encoder/decoder input
        prompts, by validating it against HuggingFace (HF) BART.

        Arguments:

        * hf_runner: HuggingFace (HF) test model runner
        * aphrodite_runner: Aphrodite test model runner
        * prompts: list of explicit encoder/decoder prompts, selected from
          the example_encoder_decoder_prompts test fixture (a dictionary of
          dummy prompts) using decoder_prompt_type as the key
        * decoder_prompt_type: key into the example_encoder_decoder_prompts
          dictionary; selects specific encoder/decoder prompt scenarios to
          test
        * model: the HF ID of the specific BART variant under test
        * dtype: the tensor datatype to employ
        * max_tokens: maximum number of tokens to generate
        * num_logprobs: number of logprobs to return per generated token
        * tensor_parallel_size: degree of tensor parallelism
        * distributed_executor_backend: distributed executor backend, if any

        A note on using HF BART as a baseline for validating Aphrodite BART,
        specifically when the decoder prompt is None.

        The HF GenerationMixin's default behavior is to force the first
        decoded token to be <BOS> if the prompt does not already contain
        <BOS> (this is accomplished using a logit processor setting).

        So when we use HF BART as our baseline for comparison, note that
        when the user provides a request with a None decoder prompt
        (i.e. a singleton encoder prompt, or else an explicit encoder/
        decoder prompt with the decoder sub-prompt set to None), HF and
        Aphrodite handle this in different ways:

        * HF will (1) tokenize the None prompt as an empty token list,
          (2) append <decoder-start-token> to the beginning, yielding
          [<decoder-start-token>], (3) pass this token list to the model,
          and then (4) after computing logits during prefill, override the
          model logits & force <BOS> to be the first generated token.

        * Aphrodite will (1) tokenize the None prompt as [<BOS>], (2) append
          <decoder-start-token> to the beginning, yielding
          [<decoder-start-token><BOS>], (3) pass these tokens to the model &
          proceed with generation.

        The net effect is that compared to Aphrodite, the list of HF
        *decoded* tokens will contain one more initial <BOS> than the
        Aphrodite generated tokens, because Aphrodite's <BOS> token is
        injected into the prompt rather than into the generated output.
        This is in spite of the fact that overall, the complete sequences
        (prompt + decoded tokens) produced by Aphrodite will match HF.

        So when we use HF decoded token output to validate Aphrodite's
        decoded token output, the testing process must account for the
        difference in decoded token sequences between Aphrodite and HF
        specifically in the decoder-prompt-is-None case.

        One option is to disable the logit processor feature that forces the
        <BOS> token to be decoded (forced_bos_token_id = None), eliminating
        the problem entirely. However, this is not "normal" BART usage.

        The other option is - only in the decoder-prompt-is-None case - to
        discard the first decoded token from the HF output before comparing
        it to Aphrodite.

        To that end, when testing the scenario where the decoder prompt is
        None (and only in that one scenario), this test skips the first HF
        decoded token during the process of validating the Aphrodite decoded
        output.
        '''
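
        # Illustrative sketch of the decoder-prompt-is-None case described
        # in the docstring (token names are schematic, not real BART vocab
        # ids):
        #
        #   HF decoder input:         [<decoder-start-token>]
        #   HF decoded output:        [<BOS>, tok_1, tok_2, ..., </s>]
        #
        #   Aphrodite decoder input:  [<decoder-start-token>, <BOS>]
        #   Aphrodite decoded output: [tok_1, tok_2, ..., </s>]
        #
        # Hence HF output index 1 aligns with Aphrodite output index 0,
        # which is why hf_skip_tokens below is 1 in this scenario.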

        # NOTE: take care with the ordering: run Aphrodite first, and then
        # run HF. Aphrodite needs a fresh new process without CUDA
        # initialization; if we run HF first, CUDA initialization will
        # already have been done, which hurts the multiprocessing backend
        # with the fork start method (the default).

        # Note: currently encoder/decoder models are only compatible with
        # enforce_eager=True. Normally this is not a problem because for
        # encoder/decoder models Aphrodite will default to
        # enforce_eager=True if enforce_eager is left unspecified. However,
        # the AphroditeRunner test fixture (which wraps around the LLM
        # class) defaults to enforce_eager=False (a behavior which a number
        # of already-existing decoder-only unit tests expect), so when
        # testing an encoder/decoder model we must explicitly specify
        # enforce_eager=True in the AphroditeRunner constructor.
        with aphrodite_runner(
                model,
                dtype=dtype,
                tensor_parallel_size=tensor_parallel_size,
                distributed_executor_backend=distributed_executor_backend,
                enforce_eager=True) as aphrodite_model:
            aphrodite_outputs = (
                aphrodite_model.generate_encoder_decoder_greedy_logprobs(
                    prompts, max_tokens, num_logprobs))

        # Configuration settings for the HF baseline: neutralize every
        # sampling knob so that HF generation is pure greedy decoding,
        # matching the greedy sampling used on the Aphrodite side.
        hf_kwargs = {
            "top_k": None,
            "num_beams": 1,
            "repetition_penalty": 1.0,
            "top_p": 1.0,
            "length_penalty": 1.0,
            "early_stopping": False,
            "no_repeat_ngram_size": None,
            "min_length": 0,
        }

        with hf_runner(model, dtype=dtype,
                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_kwargs,
                ))

        # Per the docstring above: when the decoder prompt is None, HF's
        # decoded output contains one extra leading <BOS> token, so skip it
        # when comparing against Aphrodite.
        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                          else 0)

        # check_logprobs_close tolerates a token mismatch at a given
        # position only if the token chosen by one runner appears among the
        # other runner's top-num_logprobs candidates.
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                aphrodite_to_hf_output(aphrodite_output, decoder_prompt_type)
                for aphrodite_output in aphrodite_outputs
            ],
            name_0="hf",
            name_1="aphrodite",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )

    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    def test_models(hf_runner, aphrodite_runner,
                    example_encoder_decoder_prompts, model, dtype,
                    max_tokens, num_logprobs, decoder_prompt_type) -> None:
        run_test(
            hf_runner,
            aphrodite_runner,
            example_encoder_decoder_prompts[decoder_prompt_type],
            decoder_prompt_type,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )

    @multi_gpu_test(num_gpus=2)
    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
    @pytest.mark.parametrize("dtype", ["float"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type",
                             [DecoderPromptType.CUSTOM])
    def test_models_distributed(hf_runner, aphrodite_runner,
                                example_encoder_decoder_prompts,
                                distributed_executor_backend, model, dtype,
                                max_tokens, num_logprobs,
                                decoder_prompt_type) -> None:
        run_test(
            hf_runner,
            aphrodite_runner,
            example_encoder_decoder_prompts[decoder_prompt_type],
            decoder_prompt_type,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
        )
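
# A minimal sketch of how these tests are typically invoked (paths assume
# the repository's test layout, per the module docstring):
#
#   pytest tests/models/encoder_decoder/language/test_bart.py
#   pytest tests/models/encoder_decoder/language/test_bart.py -k distributed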