# metrics.py
  1. import time
  2. from dataclasses import dataclass
  3. from typing import TYPE_CHECKING, Dict, List, Optional, Protocol
  4. import numpy as np
  5. from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
  6. disable_created_metrics)
  7. from loguru import logger
  8. if TYPE_CHECKING:
  9. from aphrodite.spec_decode.metrics import SpecDecodeWorkerMetrics
  10. disable_created_metrics()
  11. # The begin-* and end* here are used by the documentation generator
  12. # to extract the metrics definitions.
  13. # begin-metrics-definitions
  14. class Metrics:
  15. def __init__(self, labelnames: List[str]):
  16. # Unregister any existing Aphrodite collectors
  17. for collector in list(REGISTRY._collector_to_names):
  18. if hasattr(collector, "_name") and "aphrodite" in collector._name:
  19. REGISTRY.unregister(collector)
  20. # Config Information
  21. self.info_cache_config = Info(
  22. name='aphrodite:cache_config',
  23. documentation='information of cache_config')
  24. # System stats
  25. self.gauge_scheduler_running = Gauge(
  26. name="aphrodite:num_requests_running",
  27. documentation="Number of requests currently running on GPU.",
  28. labelnames=labelnames)
  29. self.gauge_scheduler_swapped = Gauge(
  30. name="aphrodite:num_requests_swapped",
  31. documentation="Number of requests swapped to CPU.",
  32. labelnames=labelnames)
  33. self.gauge_scheduler_waiting = Gauge(
  34. name="aphrodite:num_requests_waiting",
  35. documentation="Number of requests waiting to be processed.",
  36. labelnames=labelnames)
  37. self.gauge_gpu_cache_usage = Gauge(
  38. name="aphrodite:gpu_cache_usage_perc",
  39. documentation="GPU KV-cache usage. 1 means 100 percent usage.",
  40. labelnames=labelnames)
  41. self.gauge_cpu_cache_usage = Gauge(
  42. name="aphrodite:cpu_cache_usage_perc",
  43. documentation="CPU KV-cache usage. 1 means 100 percent usage.",
  44. labelnames=labelnames)
  45. # Raw stats from last model iteration
  46. self.counter_prompt_tokens = Counter(
  47. name="aphrodite:prompt_tokens_total",
  48. documentation="Number of prefill tokens processed.",
  49. labelnames=labelnames)
  50. self.counter_generation_tokens = Counter(
  51. name="aphrodite:generation_tokens_total",
  52. documentation="Number of generation tokens processed.",
  53. labelnames=labelnames)
  54. self.histogram_time_to_first_token = Histogram(
  55. name="aphrodite:time_to_first_token_seconds",
  56. documentation="Histogram of time to first token in seconds.",
  57. labelnames=labelnames,
  58. buckets=[
  59. 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
  60. 0.75, 1.0, 2.5, 5.0, 7.5, 10.0
  61. ])
  62. self.histogram_time_per_output_token = Histogram(
  63. name="aphrodite:time_per_output_token_seconds",
  64. documentation="Histogram of time per output token in seconds.",
  65. labelnames=labelnames,
  66. buckets=[
  67. 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
  68. 1.0, 2.5
  69. ])
  70. self.histogram_e2e_request_latency = Histogram(
  71. name="aphrodite:e2e_request_latency_seconds",
  72. documentation="Histogram of end to end request latency in seconds.",
  73. labelnames=labelnames,
  74. buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
  75. # Legacy metrics
  76. self.gauge_avg_prompt_throughput = Gauge(
  77. name="aphrodite:avg_prompt_throughput_toks_per_s",
  78. documentation="Average prefill throughput in tokens/s.",
  79. labelnames=labelnames,
  80. )
  81. self.gauge_avg_generation_throughput = Gauge(
  82. name="aphrodite:avg_generation_throughput_toks_per_s",
  83. documentation="Average generation throughput in tokens/s.",
  84. labelnames=labelnames,
  85. )
  86. # end-metrics-definitions
  87. @dataclass
  88. class Stats:
  89. """Created by AphroditeEngine for use by StatLogger."""
  90. now: float
  91. # System stats.
  92. num_running: int
  93. num_waiting: int
  94. num_swapped: int
  95. gpu_cache_usage: float
  96. cpu_cache_usage: float
  97. # Raw stats from last model iteration.
  98. num_prompt_tokens: int
  99. num_generation_tokens: int
  100. time_to_first_tokens: List[float]
  101. time_per_output_tokens: List[float]
  102. time_e2e_requests: List[float]
  103. spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
  104. class SupportsMetricsInfo(Protocol):
  105. def metrics_info(self) -> Dict[str, str]:
  106. ...
  107. class StatLogger:
  108. """StatLogger is used AphroditeEngine to log to Promethus and Stdout."""
  109. def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
  110. # Metadata for logging locally.
  111. self.last_local_log = time.time()
  112. self.local_interval = local_interval
  113. # Tracked stats over current local logging interval.
  114. self.num_prompt_tokens: List[int] = []
  115. self.num_generation_tokens: List[int] = []
  116. # Prometheus metrics
  117. self.labels = labels
  118. self.metrics = Metrics(labelnames=list(labels.keys()))
  119. def info(self, type: str, obj: SupportsMetricsInfo) -> None:
  120. if type == "cache_config":
  121. self.metrics.info_cache_config.info(obj.metrics_info())
  122. def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
  123. return float(np.sum(tracked_stats) / (now - self.last_local_log))
  124. def _local_interval_elapsed(self, now: float) -> bool:
  125. elapsed_time = now - self.last_local_log
  126. return elapsed_time > self.local_interval
  127. def _log_prometheus(self, stats: Stats) -> None:
  128. # Set system stat gauges.
  129. self.metrics.gauge_scheduler_running.labels(**self.labels).set(
  130. stats.num_running)
  131. self.metrics.gauge_scheduler_swapped.labels(**self.labels).set(
  132. stats.num_swapped)
  133. self.metrics.gauge_scheduler_waiting.labels(**self.labels).set(
  134. stats.num_waiting)
  135. self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set(
  136. stats.gpu_cache_usage)
  137. self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set(
  138. stats.cpu_cache_usage)
  139. # Add to token counters.
  140. self.metrics.counter_prompt_tokens.labels(**self.labels).inc(
  141. stats.num_prompt_tokens)
  142. self.metrics.counter_generation_tokens.labels(**self.labels).inc(
  143. stats.num_generation_tokens)
  144. # Observe request level latencies in histograms.
  145. for ttft in stats.time_to_first_tokens:
  146. self.metrics.histogram_time_to_first_token.labels(
  147. **self.labels).observe(ttft)
  148. for tpot in stats.time_per_output_tokens:
  149. self.metrics.histogram_time_per_output_token.labels(
  150. **self.labels).observe(tpot)
  151. for e2e in stats.time_e2e_requests:
  152. self.metrics.histogram_e2e_request_latency.labels(
  153. **self.labels).observe(e2e)
  154. def _log_prometheus_interval(self, prompt_throughput: float,
  155. generation_throughput: float) -> None:
  156. # Logs metrics to prometheus that are computed every logging_interval.
  157. # Support legacy gauge metrics that make throughput calculations on
  158. # the Aphrodite side. Moving forward, we should use counters like
  159. # counter_prompt_tokens, counter_generation_tokens
  160. # Which log raw data and calculate summaries using rate() on the
  161. # grafana/prometheus side.
  162. self.metrics.gauge_avg_prompt_throughput.labels(
  163. **self.labels).set(prompt_throughput)
  164. self.metrics.gauge_avg_generation_throughput.labels(
  165. **self.labels).set(generation_throughput)
  166. def log(self, stats: Stats) -> None:
  167. """Called by AphroditeEngine.
  168. Logs to prometheus and tracked stats every iteration.
  169. Logs to Stdout every self.local_interval seconds."""
  170. # Log to prometheus.
  171. self._log_prometheus(stats)
  172. # Save tracked stats for token counters.
  173. self.num_prompt_tokens.append(stats.num_prompt_tokens)
  174. self.num_generation_tokens.append(stats.num_generation_tokens)
  175. # Log locally every local_interval seconds.
  176. if self._local_interval_elapsed(stats.now):
  177. # Compute summary metrics for tracked stats (and log them
  178. # to promethus if applicable).
  179. prompt_throughput = self._get_throughput(self.num_prompt_tokens,
  180. now=stats.now)
  181. generation_throughput = self._get_throughput(
  182. self.num_generation_tokens, now=stats.now)
  183. self._log_prometheus_interval(
  184. prompt_throughput=prompt_throughput,
  185. generation_throughput=generation_throughput)
  186. # Log to stdout.
  187. logger.info(
  188. f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, "
  189. f"Avg generation throughput: "
  190. f"{generation_throughput:.1f} tokens/s, "
  191. f"Running: {stats.num_running} reqs, "
  192. f"Swapped: {stats.num_swapped} reqs, "
  193. f"Pending: {stats.num_waiting} reqs, "
  194. f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, "
  195. f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%")
  196. # Reset tracked stats for next interval.
  197. self.num_prompt_tokens = []
  198. self.num_generation_tokens = []
  199. self.last_local_log = stats.now
  200. if stats.spec_decode_metrics is not None:
  201. logger.info(
  202. self._format_spec_decode_metrics_str(
  203. stats.spec_decode_metrics))
  204. def _format_spec_decode_metrics_str(
  205. self, metrics: "SpecDecodeWorkerMetrics") -> str:
  206. return ("Speculative metrics: "
  207. f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
  208. f"System efficiency: {metrics.system_efficiency:.3f}, "
  209. f"Number of speculative tokens: {metrics.num_spec_tokens}, "
  210. f"Number of accepted tokens: {metrics.accepted_tokens}, "
  211. f"Number of draft tokens tokens: {metrics.draft_tokens}, "
  212. f"Number of emitted tokens tokens: {metrics.emitted_tokens}.")