utils.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. import warnings
  2. from typing import Dict, List, Optional, Sequence, Tuple, Union
  3. from aphrodite.common.sequence import Logprob, PromptLogprobs, SampleLogprobs
  4. TokensText = Tuple[List[int], str]
  5. def check_outputs_equal(
  6. *,
  7. outputs_0_lst: Sequence[TokensText],
  8. outputs_1_lst: Sequence[TokensText],
  9. name_0: str,
  10. name_1: str,
  11. ):
  12. """
  13. Compare the two sequences generated by different models,
  14. which should be equal.
  15. """
  16. assert len(outputs_0_lst) == len(outputs_1_lst)
  17. for prompt_idx, (outputs_0,
  18. outputs_1) in enumerate(zip(outputs_0_lst,
  19. outputs_1_lst)):
  20. output_ids_0, output_str_0 = outputs_0
  21. output_ids_1, output_str_1 = outputs_1
  22. # The text and token outputs should exactly match
  23. fail_msg = (f"Test{prompt_idx}:"
  24. f"\n{name_0}:\t{output_str_0!r}"
  25. f"\n{name_1}:\t{output_str_1!r}")
  26. assert output_str_0 == output_str_1, fail_msg
  27. assert output_ids_0 == output_ids_1, fail_msg
  28. # Representation of generated sequence as a tuple of
  29. # * Token ID list
  30. # * String
  31. # * List of top sample logprobs for each sampled token
  32. #
  33. # Assumes prompt logprobs were not requested.
  34. TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
  35. float]],
  36. SampleLogprobs]]]
  37. # Allow for tokens to be represented as str's rather than IDs;
  38. # tuple of
  39. # * Token string representations list
  40. # * String
  41. # * Optional list of top sample logprobs for each sampled token
  42. #
  43. # Assumes prompt logprobs were not requested.
  44. TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
  45. List[Dict[str,
  46. Logprob]]]]]
  47. # Representation of generated sequence as a tuple of
  48. # * Token ID list
  49. # * String
  50. # * Optional list of top sample logprobs for each sampled token
  51. # * Optional list of top prompt logprobs for each prompt token
  52. #
  53. # Allows prompt logprobs to be requested.
  54. TokensTextLogprobsPromptLogprobs = Tuple[
  55. List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
  56. Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
  57. def check_logprobs_close(
  58. *,
  59. outputs_0_lst: Sequence[Union[TokensTextLogprobs,
  60. TokensTextLogprobsPromptLogprobs,
  61. TextTextLogprobs]],
  62. outputs_1_lst: Sequence[Union[TokensTextLogprobs,
  63. TokensTextLogprobsPromptLogprobs,
  64. TextTextLogprobs]],
  65. name_0: str,
  66. name_1: str,
  67. num_outputs_0_skip_tokens: int = 0,
  68. warn_on_mismatch: bool = True,
  69. always_check_logprobs: bool = False,
  70. ) -> None:
  71. """Compare the logprobs of two sequences generated by different models,
  72. which should be similar but not necessarily equal.
  73. How sample logprobs are compared:
  74. * `always_check_logprobs == True`: set of highest-logprob token ids
  75. must match between seq0 and seq1 at all sampled token offsets
  76. * `always_check_logprobs == False`: highest-logprob token ids are
  77. only compared at sampled token offsets for which generated token
  78. ids don't match
  79. Prompt logprobs must be provided either for both input sequences, or
  80. for neither. If prompt logprobs are provided, then highest-logprob
  81. prompt token ids must match between seq0 and seq1 at all prompt token
  82. offsets.
  83. Args:
  84. outputs_0_lst: First sequence to compare
  85. outputs_0_lst: Second sequence to compare
  86. name_0: sequence #0 name
  87. name_1: sequence #1 name
  88. num_outputs_0_skip_tokens: If > 0, specifies the number of initial
  89. sequence #0 tokens & logprobs to discard
  90. before comparison, i.e. all
  91. of sequence #1 will be compared to
  92. sequence #0 beginning at index
  93. num_outputs_0_skip_tokens
  94. warn_on_mismatch: Issue a warning if there is token-wise or text-wise
  95. mismatch between the two sequences
  96. always_check_logprobs: If true, check logprobs even when tokens match
  97. """
  98. assert len(outputs_0_lst) == len(outputs_1_lst)
  99. # Loop through responses to each prompt.
  100. for prompt_idx, (outputs_0,
  101. outputs_1) in enumerate(zip(outputs_0_lst,
  102. outputs_1_lst)):
  103. assert len(outputs_0) == len(outputs_1)
  104. if len(outputs_0) == 3:
  105. assert len(outputs_1) == 3
  106. # Break out tokens, text & sample logprobs
  107. # (prompt logprobs were not provided)
  108. output_ids_0, output_str_0, logprobs_0 = outputs_0
  109. output_ids_1, output_str_1, logprobs_1 = outputs_1
  110. elif len(outputs_0) == 4:
  111. assert len(outputs_1) == 4
  112. # Break out tokens, text, sample logprobs & prompt logprobs
  113. (
  114. output_ids_0,
  115. output_str_0,
  116. logprobs_0,
  117. prompt_logprobs_0,
  118. ) = outputs_0
  119. (
  120. output_ids_1,
  121. output_str_1,
  122. logprobs_1,
  123. prompt_logprobs_1,
  124. ) = outputs_1
  125. # Test prompt logprobs closeness
  126. if (prompt_logprobs_0 is not None
  127. and prompt_logprobs_1 is not None):
  128. # Both sequences' prompt logprobs lists are not `None``
  129. # (although individual list elements may be `None`);
  130. # for each token's logprobs:
  131. for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
  132. zip(prompt_logprobs_0, prompt_logprobs_1)):
  133. fail_msg = (
  134. f"Prompt logprobs test:"
  135. f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}"
  136. f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}")
  137. if logprobs_elem_0 is None:
  138. # If the seq 0 token's logprobs are `None`,
  139. # the seq 1 token's logprobs must be `None`
  140. assert logprobs_elem_1 is None, fail_msg
  141. else:
  142. # If the seq 0 token's logprobs are not `None`,
  143. # the seq 1 token's logprobs must not be `None`
  144. assert logprobs_elem_1 is not None, fail_msg
  145. # Logprobs check: top-k token choices must be the same
  146. assert (set(logprobs_elem_0.keys()) == set(
  147. logprobs_elem_1.keys())), fail_msg
  148. else:
  149. # Both sequence logprobs lists must be `None`
  150. fail_msg = (f"Prompt logprobs test:"
  151. f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
  152. f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}")
  153. assert (prompt_logprobs_0 is None
  154. and prompt_logprobs_1 is None), fail_msg
  155. else:
  156. raise ValueError(f"Outputs tuple must have 3 or 4 elements but "
  157. f"{len(outputs_0)} elements were provided: "
  158. f"{outputs_0}")
  159. if logprobs_0 is None:
  160. logprobs_0 = [None] * len(output_ids_0)
  161. if logprobs_1 is None:
  162. logprobs_1 = [None] * len(output_ids_1)
  163. # Skip specified number of initial sequence #0 tokens
  164. # & logprobs, leaving output text as-is for simplicity
  165. # (text mismatches may generate warnings but do not
  166. # cause the test to fail.)
  167. if num_outputs_0_skip_tokens < 0:
  168. raise ValueError("num_outputs_0_skip_tokens must be non-negative")
  169. output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:]
  170. logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]
  171. # Loop through generated tokens.
  172. for idx, (output_id_0,
  173. output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
  174. is_tok_mismatch = output_id_0 != output_id_1
  175. # If generated tokens don't match
  176. # or it is desired to always check logprobs,
  177. # then
  178. if is_tok_mismatch or always_check_logprobs:
  179. logprobs_elem_0 = logprobs_0[idx]
  180. logprobs_elem_1 = logprobs_1[idx]
  181. # Each predicted token must be in top N logprobs of the other
  182. fail_msg = (
  183. f"Test{prompt_idx}:"
  184. f"\nMatched tokens:\t{output_ids_0[:idx]}"
  185. f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
  186. f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
  187. assert logprobs_elem_0 is not None, fail_msg
  188. assert logprobs_elem_1 is not None, fail_msg
  189. assert output_id_0 in logprobs_elem_1, fail_msg
  190. assert output_id_1 in logprobs_elem_0, fail_msg
  191. if warn_on_mismatch and is_tok_mismatch:
  192. with warnings.catch_warnings():
  193. # This ensures that repeated warnings are shown
  194. # in the output, not just the first occurrence
  195. warnings.simplefilter("always")
  196. warnings.warn(fail_msg, stacklevel=2)
  197. # Break out since sequences will now diverge.
  198. break
  199. else:
  200. if output_str_0 != output_str_1 and warn_on_mismatch:
  201. # The token outputs exactly match,
  202. # so the text outputs should exactly match as well
  203. fail_msg = (f"Test{prompt_idx}:"
  204. f"\n{name_0}:\t{output_str_0!r}"
  205. f"\n{name_1}:\t{output_str_1!r}")
  206. with warnings.catch_warnings():
  207. # This ensures that repeated warnings are shown
  208. # in the output, not just the first occurrence
  209. warnings.simplefilter("always")
  210. warnings.warn(fail_msg, stacklevel=2)