  1. """Compare the outputs of HF and Aphrodite for BART models using greedy
  2. sampling.
  3. Run `pytest tests/models/test_bart.py`.
  4. """
  5. from typing import List, Optional, Tuple
  6. from aphrodite.common.utils import is_cpu
  7. if not is_cpu():
  8. # CPU backend is not currently supported with encoder/decoder models
  9. # skip test definitions entirely to avoid importing GPU kernel libs
  10. # (xFormers, etc.)
  11. import pytest
  12. from aphrodite.common.sequence import SampleLogprobs
  13. from ..conftest import DecoderPromptType
  14. from .utils import check_logprobs_close
  15. MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]

    def aphrodite_to_hf_output(
        aphrodite_output: Tuple[List[int], str, Optional[SampleLogprobs]],
        decoder_prompt_type: DecoderPromptType,
    ):
        """Sanitize Aphrodite output to be comparable with HF output."""
        output_ids, output_str, out_logprobs = aphrodite_output

        hf_output_str = output_str + "</s>"
        if decoder_prompt_type == DecoderPromptType.NONE:
            hf_output_str = "<s>" + hf_output_str

        return output_ids, hf_output_str, out_logprobs
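
    # Illustrative usage (a sketch; the token IDs below are made-up
    # placeholders, not real BART outputs): the sanitizer always appends
    # "</s>" and, in the no-decoder-prompt case, also prepends "<s>", so the
    # Aphrodite string lines up with HF's detokenized output.
    #
    #   aphrodite_to_hf_output(([42, 43], "A cat.", None),
    #                          DecoderPromptType.NONE)
    #   -> ([42, 43], "<s>A cat.</s>", None)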

    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    def test_models(
        hf_runner,
        aphrodite_runner,
        example_encoder_decoder_prompts,
        model: str,
        dtype: str,
        max_tokens: int,
        num_logprobs: int,
        decoder_prompt_type: DecoderPromptType,
    ) -> None:
        '''
        Test the Aphrodite BART model for a variety of encoder/decoder
        input prompts, by validating it against HuggingFace (HF) BART.

        Arguments:

        * hf_runner: HuggingFace (HF) test model runner
        * aphrodite_runner: Aphrodite test model runner
        * example_encoder_decoder_prompts: test fixture which provides a
          dictionary of dummy prompts
        * model: the HF ID of the specific BART variant under test
        * dtype: the tensor datatype to employ
        * max_tokens: maximum number of tokens to generate per output
        * num_logprobs: number of top logprobs to collect per generated token
        * decoder_prompt_type: key into the example_encoder_decoder_prompts
          dictionary; selects specific encoder/decoder prompt scenarios to
          test

        A note on using HF BART as a baseline for validating Aphrodite BART,
        specifically when the decoder prompt is None.

        The HF GenerationMixin's default behavior is to force the first
        decoded token to be <BOS> if the prompt does not already contain
        <BOS> (this is accomplished using a logit processor setting.)

        So when we use HF BART as our baseline for comparison, note that
        when the user provides a request with a None decoder prompt
        (i.e. a singleton encoder prompt, or else an explicit encoder/
        decoder prompt with the decoder sub-prompt set to None), HF and
        Aphrodite handle this in different ways:

        * HF will (1) tokenize the None prompt as an empty token list,
          (2) append <decoder-start-token> to the beginning, yielding
          [<decoder-start-token>], (3) pass this token list to the model, and
          then (4) after computing logits during prefill, override the model
          logits & force <BOS> to be the first generated token.

        * Aphrodite will (1) tokenize the None prompt as [<BOS>], (2) append
          <decoder-start-token> to the beginning, yielding
          [<decoder-start-token><BOS>], (3) pass these tokens to the model &
          proceed with generation.

        The net effect is that compared to Aphrodite, the list of HF
        *decoded* tokens will contain one more initial <BOS> than the
        Aphrodite generated tokens, because Aphrodite's <BOS> token is
        injected into the prompt rather than into the generated output.
        This is in spite of the fact that overall, the complete sequences
        (prompt + decoded tokens) produced by Aphrodite will match HF.

        So when we use HF decoded token output to validate Aphrodite's
        decoded token output, the testing process must account for the
        difference in decoded token sequences between Aphrodite and HF
        specifically in the decoder-prompt-is-None case.

        One option is to disable the logit processor feature that forces the
        <BOS> token to be decoded (forced_bos_token_id = None), eliminating
        the problem entirely. However, this is not "normal" BART usage.

        The other option is - only in the decoder-prompt-is-None case - to
        discard the first decoded token from the HF output before comparing
        it to Aphrodite.

        To that end, when testing the scenario where the decoder prompt is
        None (and only in that one scenario), this test skips the first HF
        decoded token during the process of validating the Aphrodite decoded
        output; see the worked example sketched in the comments below.
        '''
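
        # Worked example (an illustrative sketch; assumes BART's BOS token
        # is "<s>" and its decoder start token is "</s>", per the standard
        # facebook/bart-* configs). For a None decoder prompt:
        #
        #   HF decoder input:          [</s>]
        #   HF decoded tokens:         [<s>, t1, t2, ...]   (first <s> forced)
        #
        #   Aphrodite decoder input:   [</s>, <s>]
        #   Aphrodite decoded tokens:  [t1, t2, ...]
        #
        # Dropping HF's first decoded token aligns the two decoded lists,
        # which is what hf_skip_tokens accomplishes below.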

        test_case_prompts = example_encoder_decoder_prompts[
            decoder_prompt_type]

        # Configuration settings for HF baseline
        hf_kwargs = {
            "top_k": None,
            "num_beams": 1,
            "repetition_penalty": 1.0,
            "top_p": 1.0,
            "length_penalty": 1.0,
            "early_stopping": False,
            "no_repeat_ngram_size": None,
            "min_length": 0,
        }
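
        # Taken together, these kwargs disable beam search, top-k/top-p
        # truncation, and repetition/length constraints, so the HF baseline
        # decodes purely greedily, matching Aphrodite's greedy sampling path.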

        with hf_runner(model, dtype=dtype,
                       is_encoder_decoder_model=True) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    test_case_prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_kwargs,
                ))

        # Note: currently encoder/decoder models are only compatible with
        # enforce_eager=True. Normally this is not a problem because for
        # encoder/decoder models Aphrodite will default to enforce_eager=True
        # if enforce_eager is left unspecified. However, the AphroditeRunner
        # test fixture (which wraps around the LLM class) defaults to
        # enforce_eager=False (a behavior which a number of already-existing
        # decoder-only unit tests expect), so when testing an encoder/decoder
        # model we must explicitly specify enforce_eager=True in the
        # AphroditeRunner constructor.
        with aphrodite_runner(model, dtype=dtype,
                              enforce_eager=True) as aphrodite_model:
            aphrodite_outputs = (
                aphrodite_model.generate_encoder_decoder_greedy_logprobs(
                    test_case_prompts, max_tokens, num_logprobs))

        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                          else 0)

        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                aphrodite_to_hf_output(aphrodite_output, decoder_prompt_type)
                for aphrodite_output in aphrodite_outputs
            ],
            name_0="hf",
            name_1="aphrodite",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )