test_sampler.py

import gc

from unittest.mock import patch

import pytest
import torch
import triton
import triton.language as tl

from aphrodite.modeling.layers.ops.sample import (_sample_triton,
                                                  _uniform_to_exponential,
                                                  sample)
from aphrodite.modeling.sampling_metadata import SamplingTensors
from aphrodite.modeling.utils import set_random_seed
from aphrodite.triton_utils.libentry import LibEntry
from aphrodite.triton_utils.sample import (MAX_TRITON_N_COLS,
                                           get_num_triton_sampler_splits)

SINGLE_SPLIT_VOCAB_SIZE = 32000  # llama/mistral/mixtral vocab size
MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100
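# A single Triton kernel launch covers at most MAX_TRITON_N_COLS columns, so
# the first vocab size exercises the single-launch path while the second
# forces the sampler to split the vocab across multiple kernel passes and
# combine the per-split results (see get_num_triton_sampler_splits below).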


@pytest.fixture(autouse=True)
def _cleanup():
    yield
    gc.collect()
    torch.cuda.empty_cache()


@triton.jit
def _uniform_to_exponential_kernel(input, output, n: tl.constexpr):
    idx = tl.arange(0, n)
    x = tl.load(input + idx)
    y = _uniform_to_exponential(x)
    tl.store(output + idx, y)


def test_uniform_to_exponential():
    """Test that we can convert uniform to exponential without div by 0."""
    input = torch.tensor([0.0, 1.0 - torch.finfo(torch.float32).eps],
                         dtype=torch.float32,
                         device="cuda")
    output = torch.zeros(input.shape, dtype=torch.float32, device="cuda")
    _uniform_to_exponential_kernel[(1, )](input, output, 2)
    assert torch.all(torch.isfinite(output))
    assert torch.all(output > 0)
    assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output))
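

# Context for the test above: the Triton sampler draws tokens with the
# exponential-race (Gumbel-max style) trick, argmax(probs / Exp(1)), instead
# of building a CDF, which is why a uniform draw must map to a strictly
# positive, finite exponential. A minimal pure-PyTorch sketch of that trick
# (illustrative only; this helper is not part of the kernel's API):
def _reference_exponential_race_sample(probs: torch.Tensor) -> torch.Tensor:
    """Sample one token id per row of `probs` via argmax(probs / Exp(1))."""
    # -log(U) with U ~ Uniform(0, 1) is Exp(1); clamp U away from 0 so the
    # noise stays finite, mirroring what _uniform_to_exponential must ensure.
    u = torch.rand_like(probs).clamp_min(torch.finfo(probs.dtype).tiny)
    exp_noise = -torch.log(u)
    # Row i's argmax lands on token j with probability probs[i, j].
    return torch.argmax(probs / exp_noise, dim=-1)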


@pytest.mark.parametrize("random_sampling", [True, False, "mixed"])
@pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5])
@pytest.mark.parametrize("modify_greedy_probs", [True, False])
@pytest.mark.parametrize("seed", [1337])
@pytest.mark.parametrize("vocab_size",
                         [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE])
@pytest.mark.parametrize("save_logprobs", [True, False])
def test_sample_decoding_only(random_sampling, max_best_of,
                              modify_greedy_probs, seed, vocab_size,
                              save_logprobs):
    set_random_seed(seed)
    bs = 8
    probs = torch.zeros((bs, vocab_size), dtype=torch.float32, device="cuda")
    for i in range(bs):
        probs[i, i * (vocab_size // bs)] = 1.0
    logprobs = torch.rand_like(probs)
    sample_indices = torch.arange(bs, dtype=torch.long, device="cuda")
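    # Decode-only: every row of probs is a sequence's next-token
    # distribution, so all bs rows are sampled (contrast with the prompt
    # logprobs test below, where only a few rows are).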
    n_splits = get_num_triton_sampler_splits(probs.shape[1])
    if random_sampling == "mixed":
        random_sampling_mask = (torch.rand(
            (1, bs), device="cuda") < 0.5).expand(n_splits, bs)
    elif random_sampling:
        random_sampling_mask = torch.ones((n_splits, bs),
                                          dtype=torch.bool,
                                          device="cuda")
    else:
        random_sampling_mask = torch.zeros((n_splits, bs),
                                           dtype=torch.bool,
                                           device="cuda")
    seeds = torch.randint(1,
                          torch.iinfo(torch.long).max, (n_splits, bs),
                          device="cuda").mul_(random_sampling_mask)
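    # Multiplying by the boolean mask zeroes out the seeds of greedy rows;
    # a zero seed is what marks a row as greedy (no noise) for the sampler,
    # while random rows keep their nonzero seed.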

    # The current _sample_triton kernel is not decorated with LibEntry;
    # patch a LibEntry-wrapped version in so this test also exercises
    # LibEntry's correctness.
    with patch("aphrodite.modeling.layers.ops.sample._sample_triton",
               LibEntry(_sample_triton)):
        sampled_tokens, sampled_logprobs, sampled_modified_probs = sample(
            probs=probs,
            logprobs=logprobs,
            sample_indices=sample_indices,
            seeds=seeds,
            max_best_of=max_best_of,
            modify_greedy_probs=modify_greedy_probs,
            save_logprobs=save_logprobs,
            _save_modified_probs=True)
    assert sampled_tokens.shape == (bs, max_best_of)
    for i in range(bs):
        assert torch.all(sampled_tokens[i] == i * (vocab_size // bs))
        request_uses_random_sampling = random_sampling_mask[0, i]
        if modify_greedy_probs and not request_uses_random_sampling:
            # If we are modifying greedy probs and the request is greedy,
            # we want to make sure the probs tensor is modified in place.
            torch.testing.assert_close(
                probs[i][sampled_tokens[i]],
                torch.full_like(probs[i][sampled_tokens[i]], 1.0))
            assert torch.sum(probs[i]) == 1.0
            torch.testing.assert_close(
                sampled_modified_probs[i][0],
                torch.full_like(sampled_modified_probs[i][0], 1.0))
        elif request_uses_random_sampling:
            # If the request is random, we want to make sure the
            # sampled_modified_probs tensor has noise added
            # (and thus differs from the probs tensor).
            assert not torch.allclose(sampled_modified_probs[i][0],
                                      probs[i][sampled_tokens[i]])
        elif not request_uses_random_sampling:
            # If the request is greedy and we are not modifying greedy probs,
            # we want to make sure the sampled_modified_probs tensor is the
            # same as the probs tensor.
            torch.testing.assert_close(sampled_modified_probs[i],
                                       probs[i][sampled_tokens[i]])

    if save_logprobs:
        assert sampled_logprobs.shape == (bs, max_best_of)
        for i in range(bs):
            for best_of in range(max_best_of):
                assert torch.all(sampled_logprobs[i] == logprobs[i][
                    sampled_tokens[i, best_of]])
    else:
        assert sampled_logprobs is None
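

# Unlike the decode-only case above, a prompt run packs prompt-token rows and
# sampled-token rows into the same probs tensor; sample_indices picks out the
# rows that are actually sampled, while the remaining rows exist only to
# produce prompt logprobs.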
@pytest.mark.parametrize("random_sampling", [True, False, "mixed"])
@pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5])
@pytest.mark.parametrize("modify_greedy_probs", [True, False])
@pytest.mark.parametrize("seed", [1337])
@pytest.mark.parametrize("vocab_size",
                         [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE])
def test_sample_prompt_logprobs(random_sampling, max_best_of,
                                modify_greedy_probs, seed, vocab_size):
    set_random_seed(seed)
    prompt_sizes = [16, 32, 64, 128] * 2
    samples = 8
    bs = samples + sum(prompt_sizes)
    probs = torch.zeros((bs, vocab_size), dtype=torch.float32, device="cuda")
    for i in range(bs):
        probs[i, i * (vocab_size // bs)] = 1.0
    logprobs = torch.rand_like(probs)
    sample_indices = torch.tensor(prompt_sizes,
                                  dtype=torch.long,
                                  device="cuda").cumsum_(0)
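    # The running sum of prompt lengths gives the flat row index of each
    # sequence's sampled position, i.e. the row right after its prompt rows.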
    n_splits = get_num_triton_sampler_splits(probs.shape[1])
    if random_sampling == "mixed":
        random_sampling_mask = torch.rand(
            (n_splits, samples), device="cuda") < 0.5
    elif random_sampling:
        random_sampling_mask = torch.ones((n_splits, samples),
                                          dtype=torch.bool,
                                          device="cuda")
    else:
        random_sampling_mask = torch.zeros((n_splits, samples),
                                           dtype=torch.bool,
                                           device="cuda")
    seeds = torch.randint(1,
                          torch.iinfo(torch.long).max, (n_splits, samples),
                          device="cuda").mul_(random_sampling_mask)

    # As above, patch in a LibEntry-wrapped _sample_triton to exercise
    # LibEntry's correctness.
    with patch("aphrodite.modeling.layers.ops.sample._sample_triton",
               LibEntry(_sample_triton)):
        sampled_tokens, sampled_logprobs, _ = sample(
            probs=probs,
            logprobs=logprobs,
            sample_indices=sample_indices,
            seeds=seeds,
            max_best_of=max_best_of,
            modify_greedy_probs=modify_greedy_probs,
            save_logprobs=True)
    assert sampled_tokens.shape == (samples, max_best_of)
    assert sampled_logprobs.shape == (samples, max_best_of)
    for i, t in enumerate(sample_indices):
        assert torch.all(sampled_tokens[i] == t * (vocab_size // bs))
        for best_of in range(max_best_of):
            assert torch.all(sampled_logprobs[i] == logprobs[
                sample_indices[i]][sampled_tokens[i, best_of]])


@pytest.mark.parametrize("seed", list(range(16)))
def test_get_sequence_seeds(seed):
    """Ensure that we get a different child seed from base
    seed + extra entropy"""
    starting_seed = seed
    seq_seed = None
    extra_entropy = 1
    for i in range(512):
        new_seq_seed = SamplingTensors._get_sequence_seeds(starting_seed,
                                                           i,
                                                           seeds_to_generate=1,
                                                           is_greedy=False)[0]
        new_seq_seed_extra_entropy = SamplingTensors._get_sequence_seeds(
            starting_seed,
            i,
            extra_entropy,
            seeds_to_generate=1,
            is_greedy=False)[0]
        assert new_seq_seed_extra_entropy != new_seq_seed
        assert seq_seed != new_seq_seed
        seq_seed = new_seq_seed
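

# A toy sketch of the property the test above pins down (NOT aphrodite's
# actual implementation): deriving child seeds by hashing the base seed
# together with per-sequence entropy means any extra entropy term, and any
# change of position, should yield a different child seed.
def _toy_child_seed(base_seed: int, *extra_entropy: int) -> int:
    import hashlib
    payload = repr((base_seed, ) + extra_entropy).encode()
    return int.from_bytes(hashlib.sha256(payload).digest()[:8], "little")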