Browse Source

feat: add prometheus production metrics (#154)

AlpinDale 1 year ago
parent
commit
81e7981dce

+ 7 - 0
aphrodite/endpoints/openai/api_server.py

@@ -9,6 +9,8 @@ import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
 
+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
 from fastapi import Request, Response, Header, HTTPException, Depends
@@ -18,6 +20,7 @@ from fastapi.responses import JSONResponse, StreamingResponse
 
 from aphrodite.engine.args_tools import AsyncEngineArgs
 from aphrodite.engine.async_aphrodite import AsyncAphrodite
+from aphrodite.engine.metrics import add_global_metrics_labels
 from aphrodite.endpoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -89,6 +92,8 @@ def parse_args():
     parser = AsyncEngineArgs.add_cli_args(parser)
     return parser.parse_args()
 
+app.add_middleware(MetricsMiddleware)  # trace HTTP server metrics
+app.add_route("/metrics", metrics)
 
 def _verify_api_key(x_api_key: str = Header(None),
                     authorization: str = Header(None)):
@@ -749,6 +754,8 @@ if __name__ == "__main__":
 
     load_chat_template(args, tokenizer)
 
+    add_global_metrics_labels(model_name=engine_args.model)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,

+ 13 - 2
aphrodite/engine/aphrodite_engine.py

@@ -7,6 +7,7 @@ from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
                                      SchedulerConfig)
 from aphrodite.processing.scheduler import Scheduler, SchedulerOutputs
 from aphrodite.engine.args_tools import EngineArgs
+from aphrodite.engine.metrics import record_metrics
 from aphrodite.engine.ray_tools import RayWorker, initialize_cluster, ray
 from aphrodite.common.logger import init_logger
 from aphrodite.common.outputs import RequestOutput
@@ -592,8 +593,8 @@ class AphroditeEngine:
         else:
             self.num_generation_tokens.append((now, num_batched_tokens))
 
-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
             return
 
         # Discard the old stats.
@@ -632,6 +633,16 @@ class AphroditeEngine:
         else:
             cpu_cache_usage = 0.0
 
+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
         logger.info("Avg prompt throughput: "
                     f"{avg_prompt_throughput:.1f} tokens/s, "
                     "Avg generation throughput: "

+ 52 - 0
aphrodite/engine/metrics.py

@@ -0,0 +1,52 @@
+from aioprometheus import Gauge
+
+# The begin-* and end-* markers here are used by the documentation generator
+# to extract the metrics definitions.
+
+# begin-metrics-definitions
+gauge_avg_prompt_throughput = Gauge(
+    "aphrodite:avg_prompt_throughput_toks_per_s",
+    "Average prefill throughput in tokens/s.")
+gauge_avg_generation_throughput = Gauge(
+    "aphrodite:avg_generation_throughput_toks_per_s",
+    "Average generation throughput in tokens/s.")
+
+gauge_scheduler_running = Gauge(
+    "aphrodite:num_requests_running",
+    "Number of requests that are currently running for inference.")
+gauge_scheduler_swapped = Gauge("aphrodite:num_requests_swapped",
+                                "Number of requests swapped to CPU.")
+gauge_scheduler_waiting = Gauge("aphrodite:num_requests_waiting",
+                                "Number of requests waiting to be processed.")
+
+gauge_gpu_cache_usage = Gauge(
+    "aphrodite:gpu_cache_usage_perc",
+    "GPU KV-cache usage. 1 means 100 percent usage.")
+gauge_cpu_cache_usage = Gauge(
+    "aphrodite:cpu_cache_usage_perc",
+    "CPU KV-cache usage. 1 means 100 percent usage.")
+# end-metrics-definitions
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
+
+def record_metrics(
+    avg_prompt_throughput: float,
+    avg_generation_throughput: float,
+    scheduler_running: int,
+    scheduler_swapped: int,
+    scheduler_waiting: int,
+    gpu_cache_usage: float,
+    cpu_cache_usage: float,
+):
+    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
+    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
+    gauge_scheduler_running.set(labels, scheduler_running)
+    gauge_scheduler_swapped.set(labels, scheduler_swapped)
+    gauge_scheduler_waiting.set(labels, scheduler_waiting)
+    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
+    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)

+ 2 - 1
requirements.txt

@@ -15,4 +15,5 @@ pyarrow # needed for ray
 pandas
 fastapi
 colorlog
-einops # for phi
+einops # for phi
+aioprometheus[starlette] # for prometheus metrics