import time
from typing import List

import pytest
import ray
from prometheus_client import REGISTRY

from aphrodite import AphroditeEngine, EngineArgs
from aphrodite.common.sampling_params import SamplingParams
from aphrodite.engine.args_tools import AsyncEngineArgs
from aphrodite.engine.async_aphrodite import AsyncAphrodite
from aphrodite.engine.metrics import RayPrometheusStatLogger

from ..conftest import cleanup

MODELS = [
    "facebook/opt-125m",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [128])
def test_metric_counter_prompt_tokens(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
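    """Check that the Prometheus prompt-token counter matches the total
    number of prompt tokens sent through the engine."""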
    with aphrodite_runner(model,
                          dtype=dtype,
                          disable_log_stats=False,
                          gpu_memory_utilization=0.4) as aphrodite_model:
        tokenizer = aphrodite_model.model.get_tokenizer()
        prompt_token_counts = [
            len(tokenizer.encode(p)) for p in example_prompts
        ]
        # This test needs at least 2 prompts in a batch of different lengths
        # to verify their token count is correct despite padding.
        assert len(example_prompts) > 1, "at least 2 prompts are required"
        assert prompt_token_counts[0] != prompt_token_counts[1], (
            "prompts of different lengths are required")
        aphrodite_prompt_token_count = sum(prompt_token_counts)

        _ = aphrodite_model.generate_greedy(example_prompts, max_tokens)
        stat_logger = (
            aphrodite_model.model.llm_engine.stat_loggers['prometheus'])
        metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
            **stat_logger.labels)._value.get()

    assert aphrodite_prompt_token_count == metric_count, (
        f"prompt token count: {aphrodite_prompt_token_count!r}\n"
        f"metric: {metric_count!r}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [128])
def test_metric_counter_generation_tokens(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
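    """Check that the Prometheus generation-token counter matches the number
    of tokens actually generated for the batch."""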
    with aphrodite_runner(model,
                          dtype=dtype,
                          disable_log_stats=False,
                          gpu_memory_utilization=0.4) as aphrodite_model:
        aphrodite_outputs = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)
        tokenizer = aphrodite_model.model.get_tokenizer()
        stat_logger = (
            aphrodite_model.model.llm_engine.stat_loggers['prometheus'])
        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
            **stat_logger.labels)._value.get()
        aphrodite_generation_count = 0
        for i in range(len(example_prompts)):
            aphrodite_output_ids, aphrodite_output_str = aphrodite_outputs[i]
            prompt_ids = tokenizer.encode(example_prompts[i])
            # aphrodite_output_ids contains both prompt tokens and generation
            # tokens. We're interested only in the count of the generation
            # tokens.
            aphrodite_generation_count += len(aphrodite_output_ids) - len(
                prompt_ids)

    assert aphrodite_generation_count == metric_count, (
        f"generation token count: {aphrodite_generation_count!r}\n"
        f"metric: {metric_count!r}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
    "served_model_name",
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(aphrodite_runner, model: str, dtype: str,
                                   served_model_name: List[str]) -> None:
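    """Check that the model_name metrics label defaults to the model ID when
    served_model_name is unset or empty, and otherwise uses the first served
    model name."""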
    with aphrodite_runner(model,
                          dtype=dtype,
                          disable_log_stats=False,
                          gpu_memory_utilization=0.3,
                          served_model_name=served_model_name) as aphrodite_model:
        stat_logger = (
            aphrodite_model.model.llm_engine.stat_loggers['prometheus'])
        metrics_tag_content = stat_logger.labels["model_name"]

    if served_model_name is None or served_model_name == []:
        assert metrics_tag_content == model, (
            f"Metrics tag model_name is wrong! expect: {model!r}\n"
            f"actual: {metrics_tag_content!r}")
    else:
        assert metrics_tag_content == served_model_name[0], (
            f"Metrics tag model_name is wrong! expect: "
            f"{served_model_name[0]!r}\n"
            f"actual: {metrics_tag_content!r}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("disable_log_stats", [True, False])
@pytest.mark.asyncio
async def test_async_engine_log_metrics_regression(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    disable_log_stats: bool,
) -> None:
    """
    Regression test ensuring the async engine generates metrics
    when disable_log_stats=False (and exposes no stat loggers when it
    is True).
    """
    engine_args = AsyncEngineArgs(model=model,
                                  dtype=dtype,
                                  disable_log_stats=disable_log_stats)
    async_engine = AsyncAphrodite.from_engine_args(engine_args)
    for i, prompt in enumerate(example_prompts):
        results = async_engine.generate(
            prompt,
            SamplingParams(max_tokens=max_tokens),
            f"request-id-{i}",
        )
        # Exhaust the async iterator to make the async engine work.
        async for _ in results:
            pass

    assert_metrics(async_engine.engine, disable_log_stats,
                   len(example_prompts))


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("disable_log_stats", [True, False])
def test_engine_log_metrics_regression(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    disable_log_stats: bool,
) -> None:
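    """Same regression check as above, but for the synchronous engine."""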
    engine_args = EngineArgs(model=model,
                             dtype=dtype,
                             disable_log_stats=disable_log_stats)
    engine = AphroditeEngine.from_engine_args(engine_args)
    for i, prompt in enumerate(example_prompts):
        engine.add_request(
            f"request-id-{i}",
            prompt,
            SamplingParams(max_tokens=max_tokens),
        )
    while engine.has_unfinished_requests():
        engine.step()

    assert_metrics(engine, disable_log_stats, len(example_prompts))


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
def test_metric_spec_decode(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
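    """Smoke-check the speculative decoding metrics: each value should fall
    within a loose expected range rather than match an exact number."""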
    k = 5

    with aphrodite_runner(model,
                          dtype=dtype,
                          disable_log_stats=False,
                          gpu_memory_utilization=0.4,
                          speculative_model=model,
                          num_speculative_tokens=k,
                          use_v2_block_manager=True) as aphrodite_model:

        # Force log interval to be 0 to catch all metrics.
        stat_logger = (
            aphrodite_model.model.llm_engine.stat_loggers['prometheus'])
        stat_logger.local_interval = 0

        # Note that the purpose of this test is to verify spec decode
        # metrics instead of functional correctness, so the expected values
        # are intended to be loose.
        metric_name_to_expected_fn = {
            "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
            "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
            "counter_spec_decode_num_emitted_tokens":
            lambda v: 0 <= v <= k + 1,
        }

        # Use one request to better inspect the metrics.
        prompts = example_prompts[:1]

        _ = aphrodite_model.generate_greedy(prompts, max_tokens)
        for metric_name, is_expected in metric_name_to_expected_fn.items():
            metric_val = getattr(
                stat_logger.metrics,
                metric_name).labels(**stat_logger.labels)._value.get()
            assert is_expected(metric_val), (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    log_interval: int,
) -> None:
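    """Check that speculative decoding metrics are still collected correctly
    when the stat logger uses a non-zero log interval."""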
    k = 5

    engine_args = EngineArgs(model=model,
                             dtype=dtype,
                             disable_log_stats=False,
                             gpu_memory_utilization=0.4,
                             speculative_model=model,
                             num_speculative_tokens=k,
                             use_v2_block_manager=True,
                             enforce_eager=True)
    engine = AphroditeEngine.from_engine_args(engine_args)

    try:

        engine.add_request(
            "request-id-0",
            example_prompts[0],
            SamplingParams(max_tokens=max_tokens),
        )

        # Set the log interval.
        stat_logger = engine.stat_loggers['prometheus']
        stat_logger.local_interval = log_interval

        # Prefill.
        engine.step()

        # Wait for 5 seconds to ensure that spec decode metrics
        # get triggered in the first decode step.
        time.sleep(5)

        # The first decode step should trigger async collection of metrics.
        engine.step()

        # Wait one second to allow the H2D transfer to finish.
        time.sleep(1)

        # The second decode step should now be able to collect the spec
        # decode stats, and the request should also be finished.
        engine.step()

        # The request must have finished by now.
        assert not engine.has_unfinished_requests()

        # Wait to ensure logging occurs.
        time.sleep(log_interval)

        # Force logging.
        engine.step()

        # Note that the purpose of this test is to verify spec decode
        # metrics instead of functional correctness, so the expected values
        # are intended to be loose.
        metric_name_to_expected_fn = {
            "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
            "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
            "counter_spec_decode_num_emitted_tokens":
            lambda v: 0 <= v <= k + 1,
        }

        for metric_name, is_expected in metric_name_to_expected_fn.items():
            metric_val = getattr(
                stat_logger.metrics,
                metric_name).labels(**stat_logger.labels)._value.get()
            assert is_expected(metric_val), (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")

    finally:
        del engine
        cleanup()


def assert_metrics(engine: AphroditeEngine, disable_log_stats: bool,
                   num_requests: int) -> None:
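    """When stats are disabled, accessing stat_loggers should raise; when
    enabled, each request-level histogram should have recorded one sample
    per request."""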
    if disable_log_stats:
        with pytest.raises(AttributeError):
            _ = engine.stat_loggers
    else:
        assert (engine.stat_loggers
                is not None), "engine.stat_loggers should be set"

        # As a simple sanity check that metrics are generated, ensure that
        # the count bucket of each request-level histogram metric matches
        # the number of requests.
        labels = {'model_name': engine.model_config.model}
        request_histogram_metrics = [
            "aphrodite:e2e_request_latency_seconds",
            "aphrodite:request_prompt_tokens",
            "aphrodite:request_generation_tokens",
            "aphrodite:request_params_best_of",
            "aphrodite:request_params_n",
        ]
        for metric_name in request_histogram_metrics:
            metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
                                                     labels)
            assert (
                metric_value == num_requests), "Metrics should be collected"


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
def test_engine_log_metrics_ray(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # This test is quite weak: it only checks that we can use
    # RayPrometheusStatLogger without exceptions.
    # Checking whether the metrics are actually emitted is unfortunately
    # non-trivial.

    # We have to run in a Ray task for Ray metrics to be emitted correctly.
    @ray.remote(num_gpus=1)
    def _inner():

        class _RayPrometheusStatLogger(RayPrometheusStatLogger):

            def __init__(self, *args, **kwargs):
                self._i = 0
                super().__init__(*args, **kwargs)

            def log(self, *args, **kwargs):
                self._i += 1
                return super().log(*args, **kwargs)

        engine_args = EngineArgs(
            model=model,
            dtype=dtype,
            disable_log_stats=False,
        )
        engine = AphroditeEngine.from_engine_args(engine_args)
        logger = _RayPrometheusStatLogger(
            local_interval=0.5,
            labels=dict(model_name=engine.model_config.served_model_name),
            max_model_len=engine.model_config.max_model_len)
        engine.add_logger("ray", logger)
        for i, prompt in enumerate(example_prompts):
            engine.add_request(
                f"request-id-{i}",
                prompt,
                SamplingParams(max_tokens=max_tokens),
            )
        while engine.has_unfinished_requests():
            engine.step()
        assert logger._i > 0, ".log must be called at least once"

    ray.get(_inner.remote())