test_metrics.py
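"""Tests for the Prometheus metrics served at the /metrics endpoint."""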

from http import HTTPStatus

import openai
import pytest
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


@pytest.fixture(scope="module")
def default_server_args():
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
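

# The client fixture is parametrized so the whole module runs once per server
# configuration: the defaults above, with chunked prefill enabled, and with
# frontend multiprocessing disabled.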
@pytest.fixture(scope="module",
                params=[
                    "",
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
def client(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server.get_async_client()
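

# Tokenize the prompt up front so the expected prompt-token counts below can
# be computed exactly.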
_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
    "aphrodite:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
    "aphrodite:time_per_output_token_seconds":
    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
    "aphrodite:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
    "aphrodite:request_prompt_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "aphrodite:request_generation_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "aphrodite:request_params_n": [("_count", _NUM_REQUESTS)],
    "aphrodite:request_params_best_of": [("_count", _NUM_REQUESTS)],
    "aphrodite:prompt_tokens":
    [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
    "aphrodite:generation_tokens":
    [("_total", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
    "aphrodite:request_success": [("_total", _NUM_REQUESTS)],
}
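

# Send _NUM_REQUESTS identical completions, then scrape /metrics and check
# that the counter and histogram samples match EXPECTED_VALUES.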
@pytest.mark.asyncio
async def test_metrics_counts(client: openai.AsyncOpenAI):
    # Strip the trailing "v1/" from the client base URL to reach the server
    # root, where the /metrics endpoint is served.
    base_url = str(client.base_url)[:-3].strip("/")

    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=MODEL_NAME,
            prompt=_TOKENIZED_PROMPT,
            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)

    response = requests.get(base_url + "/metrics")
    print(response.text)
    assert response.status_code == HTTPStatus.OK

    # Loop over all expected metric_families
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
        found_metric = False

        # Check to see if the metric_family is found in the prom endpoint.
        for family in text_string_to_metric_families(response.text):
            if family.name == metric_family:
                found_metric = True

                # Check that each suffix is found in the prom endpoint.
                for suffix, expected_value in suffix_values_list:
                    metric_name_w_suffix = f"{metric_family}{suffix}"
                    found_suffix = False

                    for sample in family.samples:
                        if sample.name == metric_name_w_suffix:
                            found_suffix = True

                            # For each suffix, make sure the value matches
                            # what we expect.
                            assert sample.value == expected_value, (
                                f"{metric_name_w_suffix} expected value of "
                                f"{expected_value} did not match found value "
                                f"{sample.value}")
                            break

                    assert found_suffix, (
                        f"Did not find {metric_name_w_suffix} in prom endpoint"
                    )
                break

        assert found_metric, (f"Did not find {metric_family} in prom endpoint")
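

# Metric names (and the label keys of cache_config_info) that must appear
# verbatim in the /metrics output.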
EXPECTED_METRICS = [
    "aphrodite:num_requests_running",
    "aphrodite:num_requests_swapped",
    "aphrodite:num_requests_waiting",
    "aphrodite:gpu_cache_usage_perc",
    "aphrodite:cpu_cache_usage_perc",
    "aphrodite:time_to_first_token_seconds_sum",
    "aphrodite:time_to_first_token_seconds_bucket",
    "aphrodite:time_to_first_token_seconds_count",
    "aphrodite:time_per_output_token_seconds_sum",
    "aphrodite:time_per_output_token_seconds_bucket",
    "aphrodite:time_per_output_token_seconds_count",
    "aphrodite:e2e_request_latency_seconds_sum",
    "aphrodite:e2e_request_latency_seconds_bucket",
    "aphrodite:e2e_request_latency_seconds_count",
    "aphrodite:request_prompt_tokens_sum",
    "aphrodite:request_prompt_tokens_bucket",
    "aphrodite:request_prompt_tokens_count",
    "aphrodite:request_generation_tokens_sum",
    "aphrodite:request_generation_tokens_bucket",
    "aphrodite:request_generation_tokens_count",
    "aphrodite:request_params_n_sum",
    "aphrodite:request_params_n_bucket",
    "aphrodite:request_params_n_count",
    "aphrodite:request_params_best_of_sum",
    "aphrodite:request_params_best_of_bucket",
    "aphrodite:request_params_best_of_count",
    "aphrodite:num_preemptions_total",
    "aphrodite:prompt_tokens_total",
    "aphrodite:generation_tokens_total",
    "aphrodite:request_success_total",
    "aphrodite:cache_config_info",
    # labels in cache_config_info
    "block_size",
    "cache_dtype",
    "cpu_offload_gb",
    "enable_prefix_caching",
    "gpu_memory_utilization",
    "num_cpu_blocks",
    "num_gpu_blocks",
    "num_gpu_blocks_override",
    "sliding_window",
    "swap_space_bytes",
]
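

# Issue a single completion and check that every expected metric name appears
# in the /metrics output, without asserting on values.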
@pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")

    # sending a request triggers the metrics to be logged.
    await client.completions.create(model=MODEL_NAME,
                                    prompt="Hello, my name is",
                                    max_tokens=5,
                                    temperature=0.0)

    response = requests.get(base_url + "/metrics")
    assert response.status_code == HTTPStatus.OK

    for metric in EXPECTED_METRICS:
        assert metric in response.text