1
0

async_aphrodite.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023
  1. import asyncio
  2. import time
  3. import weakref
  4. from functools import partial
  5. from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
  6. Optional, Set, Tuple, Type, Union)
  7. from weakref import ReferenceType
  8. from loguru import logger
  9. import aphrodite.common.envs as envs
  10. from aphrodite.common.config import (DecodingConfig, EngineConfig, LoRAConfig,
  11. ModelConfig, ParallelConfig,
  12. SchedulerConfig)
  13. from aphrodite.common.outputs import EmbeddingRequestOutput, RequestOutput
  14. from aphrodite.common.pooling_params import PoolingParams
  15. from aphrodite.common.sampling_params import SamplingParams
  16. from aphrodite.common.sequence import ExecuteModelRequest
  17. from aphrodite.common.utils import weak_bind
  18. from aphrodite.engine.aphrodite_engine import (AphroditeEngine,
  19. SchedulerOutputState)
  20. from aphrodite.engine.args_tools import AsyncEngineArgs
  21. from aphrodite.engine.async_timeout import asyncio_timeout
  22. from aphrodite.engine.metrics_types import StatLoggerBase
  23. from aphrodite.executor.executor_base import ExecutorAsyncBase
  24. from aphrodite.executor.ray_utils import initialize_ray_cluster
  25. from aphrodite.inputs import PromptInputs
  26. from aphrodite.lora.request import LoRARequest
  27. from aphrodite.modeling.layers.sampler import SamplerOutput
  28. from aphrodite.processing.scheduler import SchedulerOutputs
  29. from aphrodite.prompt_adapter.request import PromptAdapterRequest
  30. from aphrodite.transformers_utils.tokenizer import AnyTokenizer
# Upper bound for a single engine-loop iteration, taken from the env-backed
# settings module. It is passed to `asyncio_timeout(...)` in
# `AsyncAphrodite.run_engine_loop`.
# NOTE(review): the `_S` suffix suggests seconds -- confirm against `envs`.
ENGINE_ITERATION_TIMEOUT_S = envs.APHRODITE_ENGINE_ITERATION_TIMEOUT_S
  32. class AsyncEngineDeadError(RuntimeError):
  33. pass
  34. def _log_task_completion(task: asyncio.Task,
  35. error_callback: Callable[[Exception], None]) -> None:
  36. """This function is only intended for the `engine.run_engine_loop()` task.
  37. In particular, that task runs a `while True` loop that can only exit if
  38. there is an exception.
  39. """
  40. exception = None
  41. try:
  42. return_value = task.result()
  43. raise AssertionError(
  44. f"The engine background task should never finish without an "
  45. f"exception. {return_value}")
  46. except asyncio.exceptions.CancelledError:
  47. # We assume that if the task is cancelled, we are gracefully shutting
  48. # down. This should only happen on program exit.
  49. logger.info("Engine is gracefully shutting down.")
  50. except Exception as e:
  51. exception = e
  52. logger.error("Engine background task failed", exc_info=e)
  53. error_callback(exception)
  54. raise AsyncEngineDeadError(
  55. "Task finished unexpectedly. This should never happen! "
  56. "Please open an issue on Github. See stack trace above for the "
  57. "actual cause.") from e
  58. STOP_ITERATION = Exception() # Sentinel
  59. class AsyncStream:
  60. """A stream of RequestOutputs or EmbeddingRequestOutputs for a request
  61. that can be iterated over asynchronously via an async generator."""
  62. def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
  63. self.request_id = request_id
  64. self._cancel = cancel
  65. self._queue: asyncio.Queue = asyncio.Queue()
  66. self._finished = False
  67. def put(self, item: Union[RequestOutput, EmbeddingRequestOutput,
  68. Exception]) -> None:
  69. if not self._finished:
  70. self._queue.put_nowait(item)
  71. def finish(
  72. self,
  73. exception: Optional[Union[BaseException, Type[BaseException]]] = None,
  74. ) -> None:
  75. if not self._finished:
  76. self._finished = True
  77. self._queue.put_nowait(
  78. exception if self._is_raisable(exception) else STOP_ITERATION)
  79. @property
  80. def finished(self) -> bool:
  81. return self._finished
  82. async def generator(
  83. self
  84. ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]:
  85. try:
  86. while True:
  87. result = await self._queue.get()
  88. if self._is_raisable(result):
  89. if result == STOP_ITERATION:
  90. return
  91. raise result
  92. yield result
  93. except GeneratorExit:
  94. self._cancel(self.request_id)
  95. raise asyncio.CancelledError from None
  96. @staticmethod
  97. def _is_raisable(value: Any):
  98. return isinstance(value, BaseException) or \
  99. (isinstance(value, type) and \
  100. issubclass(value, BaseException))
  101. class RequestTracker:
  102. """Synchronous abstraction for tracking requests."""
  103. def __init__(self) -> None:
  104. self._request_streams: Dict[str, AsyncStream] = {}
  105. self._aborted_requests: asyncio.Queue[str] = asyncio.Queue()
  106. self._new_requests: asyncio.Queue[Tuple[AsyncStream,
  107. dict]] = asyncio.Queue()
  108. self.new_requests_event = asyncio.Event()
  109. def __contains__(self, item):
  110. return item in self._request_streams
  111. def __len__(self) -> int:
  112. return len(self._request_streams)
  113. def propagate_exception(self,
  114. exc: Exception,
  115. request_id: Optional[str] = None) -> None:
  116. """Propagate an exception to request streams
  117. (all if request_id is None)."""
  118. if request_id is not None:
  119. self.abort_request(request_id, exception=exc)
  120. else:
  121. # NB: tuple() used here because self.abort_request pops the stream
  122. # out of self._request_streams, so we can't iterate on it directly
  123. for rid in tuple(self._request_streams.keys()):
  124. self.abort_request(rid, exception=exc)
  125. def process_request_output(self,
  126. request_output: Union[RequestOutput,
  127. EmbeddingRequestOutput],
  128. *,
  129. verbose: bool = False) -> None:
  130. """Process a request output from the engine."""
  131. request_id = request_output.request_id
  132. finished = request_output.finished
  133. if finished:
  134. stream = self._request_streams.pop(request_id, None)
  135. else:
  136. stream = self._request_streams.get(request_id)
  137. # Guard against a KeyError which can occur if the request was aborted
  138. # while the output was generated
  139. if stream is not None:
  140. stream.put(request_output)
  141. if finished:
  142. stream.finish()
  143. if verbose and finished:
  144. logger.info(f"Finished request {request_id}.")
  145. def process_exception(self,
  146. request_id: str,
  147. exception: BaseException,
  148. *,
  149. verbose: bool = False) -> None:
  150. """Propagate an exception from the engine."""
  151. if verbose:
  152. logger.info(f"Finished request {request_id}.")
  153. self.abort_request(request_id, exception=exception)
  154. def add_request(self,
  155. request_id: str,
  156. *,
  157. verbose: bool = False,
  158. **engine_add_request_kwargs) -> AsyncStream:
  159. """Add a request to be sent to the engine on the next background
  160. loop iteration."""
  161. if request_id in self._request_streams:
  162. raise KeyError(f"Request {request_id} already exists.")
  163. abort_request = partial(self.abort_request, verbose=verbose)
  164. stream = AsyncStream(request_id, abort_request)
  165. self._new_requests.put_nowait((stream, {
  166. "request_id": request_id,
  167. **engine_add_request_kwargs
  168. }))
  169. self.new_requests_event.set()
  170. if verbose:
  171. logger.info(f"Added request {request_id}.")
  172. return stream
  173. def abort_request(self,
  174. request_id: str,
  175. *,
  176. exception: Optional[Union[BaseException,
  177. Type[BaseException]]] = None,
  178. verbose: bool = False) -> None:
  179. """Abort a request during next background loop iteration."""
  180. if verbose:
  181. logger.info(f"Aborted request {request_id}.")
  182. self._aborted_requests.put_nowait(request_id)
  183. stream = self._request_streams.pop(request_id, None)
  184. if stream is not None:
  185. stream.finish(exception=exception)
  186. def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]:
  187. """Get the new requests and finished requests to be
  188. sent to the engine."""
  189. new_requests: List[Dict] = []
  190. finished_requests: Set[str] = set()
  191. while not self._aborted_requests.empty():
  192. request_id = self._aborted_requests.get_nowait()
  193. finished_requests.add(request_id)
  194. while not self._new_requests.empty():
  195. stream, new_request = self._new_requests.get_nowait()
  196. request_id = stream.request_id
  197. if request_id in finished_requests:
  198. # The request has already been aborted.
  199. stream.finish(asyncio.CancelledError)
  200. finished_requests.discard(request_id)
  201. else:
  202. self._request_streams[request_id] = stream
  203. new_requests.append(new_request)
  204. return new_requests, finished_requests
  205. async def wait_for_new_requests(self):
  206. if not self.has_new_requests():
  207. await self.new_requests_event.wait()
  208. self.new_requests_event.clear()
  209. def has_new_requests(self):
  210. return not self._new_requests.empty()
class _AsyncAphrodite(AphroditeEngine):
    """Extension of AphroditeEngine to add async methods."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to `AphroditeEngine.__init__`."""
        super().__init__(*args, **kwargs)

    async def step_async(
        self, virtual_engine: int
    ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
        """Performs one decoding iteration and returns newly generated results.
        The workers are ran asynchronously if possible.

        This function performs one decoding iteration of the engine. It first
        schedules the sequences to be executed in the next iteration and the
        token blocks to be swapped in/out/copy. Then, it executes the model
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.

        Args:
            virtual_engine: Index used to select this step's scheduler,
                scheduler context, cached scheduler outputs and async
                callback.

        Returns:
            The `request_outputs` list of the selected scheduler context.
        """
        # these are cached outputs from previous iterations. None if on first
        # iteration
        cached_outputs = self.cached_scheduler_outputs[virtual_engine]
        seq_group_metadata_list = cached_outputs.seq_group_metadata_list
        scheduler_outputs = cached_outputs.scheduler_outputs
        allow_async_output_proc = cached_outputs.allow_async_output_proc

        ctx = self.scheduler_contexts[virtual_engine]

        # Clear outputs for each new scheduler iteration
        ctx.request_outputs.clear()

        # skip the scheduler if there are any remaining steps in the seq groups.
        # This ensures that the scheduler is only called again when the current
        # batch has completed.
        if not self._has_remaining_steps(seq_group_metadata_list):

            # Schedule iteration
            (seq_group_metadata_list, scheduler_outputs,
             allow_async_output_proc
             ) = self.scheduler[virtual_engine].schedule()

            ctx.seq_group_metadata_list = seq_group_metadata_list
            ctx.scheduler_outputs = scheduler_outputs

            # Maybe switch from async mode to sync mode
            if not allow_async_output_proc and len(ctx.output_queue) > 0:
                self._process_model_outputs(ctx=ctx)

            if (self.scheduler_config.is_multi_step
                    and scheduler_outputs.num_lookahead_slots > 0):
                # cache the scheduler outputs for the next iteration if we have
                # lookahead slots
                self._cache_scheduler_outputs_for_multi_step(
                    virtual_engine, seq_group_metadata_list, scheduler_outputs,
                    allow_async_output_proc)

        assert seq_group_metadata_list is not None
        assert scheduler_outputs is not None

        if not scheduler_outputs.is_empty():
            finished_requests_ids = self.scheduler[
                virtual_engine].get_and_reset_finished_requests_ids()

            # Check if we have a cached last_output from the previous iteration.
            # For supporting PP this is probably the best way to pass the
            # sampled_token_ids, as a separate broadcast over all the PP stages
            # will cause one virtual engine's microbatch to block the pipeline.
            last_sampled_token_ids = \
                self._get_last_sampled_token_ids(virtual_engine)

            execute_model_req = ExecuteModelRequest(
                seq_group_metadata_list=seq_group_metadata_list,
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
                virtual_engine=virtual_engine,
                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                running_queue_size=scheduler_outputs.running_queue_size,
                finished_requests_ids=finished_requests_ids,
                # We use ExecuteModelRequest to pass the last sampled_token_ids
                # to each of the non-last PP stages for in-place prepare_input.
                last_sampled_token_ids=last_sampled_token_ids)

            if allow_async_output_proc:
                execute_model_req.async_callback = self.async_callbacks[
                    virtual_engine]

            # Execute the model.
            outputs = await self.model_executor.execute_model_async(
                execute_model_req)

            # we need to do this here so that last step's sampled_token_ids can
            # be passed to the next iteration for PP.
            if self.scheduler_config.is_multi_step:
                self._update_cached_scheduler_output(virtual_engine, outputs)
        else:
            # Nothing was scheduled: flush any queued outputs and run no model
            # step this iteration.
            if len(ctx.output_queue) > 0:
                self._process_model_outputs(ctx=ctx)
            outputs = []

        # Finish the current step for all the sequence groups.
        if self.scheduler_config.is_multi_step:
            for seq_group in seq_group_metadata_list:
                seq_group.finish_step()

        if not self._has_remaining_steps(seq_group_metadata_list):
            # Clear the cache if we have finished all the steps
            if self.scheduler_config.is_multi_step:
                self.cached_scheduler_outputs[
                    virtual_engine] = SchedulerOutputState()

            ctx.append_output(outputs=outputs,
                              seq_group_metadata_list=seq_group_metadata_list,
                              scheduler_outputs=scheduler_outputs,
                              is_async=allow_async_output_proc,
                              is_last_step=True)

            if outputs and allow_async_output_proc:
                assert len(
                    outputs
                ) == 1, "Async postprocessor expects only a single output set"
                self._advance_to_next_step(
                    outputs[0], seq_group_metadata_list,
                    scheduler_outputs.scheduled_seq_groups)

            if not allow_async_output_proc:
                self._process_model_outputs(ctx=ctx)

                # Log stats.
                self.do_log_stats(scheduler_outputs, outputs)

        else:
            # Multi-step case
            return ctx.request_outputs

        if not self.has_unfinished_requests():
            # Drain async postprocessor (if exists)
            if len(ctx.output_queue) > 0:
                self._process_model_outputs(ctx=ctx)
            assert len(ctx.output_queue) == 0

        return ctx.request_outputs

    async def stop_remote_worker_execution_loop_async(self) -> None:
        """Stop the remote worker execution loop."""
        await self.model_executor.stop_remote_worker_execution_loop_async()

    async def add_request_async(
        self,
        request_id: str,
        inputs: PromptInputs,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> None:
        """Async version of :meth:`add_request`.

        Preprocesses `inputs` asynchronously, runs the input processor on the
        result and registers the processed request with the engine.

        Args:
            request_id: Identifier of the request.
            inputs: Prompt inputs handed to the input preprocessor.
            params: Sampling or pooling parameters for the request.
            arrival_time: Arrival timestamp; defaults to `time.time()`.
            lora_request: Optional LoRA request; requires a LoRA config.
            prompt_adapter_request: Optional prompt adapter request.

        Raises:
            ValueError: If `lora_request` is given but LoRA is not enabled.
        """
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        if arrival_time is None:
            arrival_time = time.time()

        preprocessed_inputs = await self.input_preprocessor.preprocess_async(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )
        processed_inputs = self.input_processor(preprocessed_inputs)

        self._add_processed_request(
            request_id=request_id,
            processed_inputs=processed_inputs,
            params=params,
            arrival_time=arrival_time,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def check_health_async(self) -> None:
        """Run the tokenizer's health check (when a tokenizer is set) and the
        model executor's health check."""
        if self.tokenizer:
            self.tokenizer.check_health()
        self.model_executor.check_health()
  363. class AsyncAphrodite:
  364. """An asynchronous wrapper for :class:`AphroditeEngine`.
  365. This class is used to wrap the :class:`AphroditeEngine` class to make it
  366. asynchronous. It uses asyncio to create a background loop that keeps
  367. processing incoming requests. The :class:`AphroditeEngine` is kicked by the
  368. generate method when there are requests in the waiting queue. The generate
  369. method yields the outputs from the :class:`AphroditeEngine` to the caller.
  370. Args:
  371. log_requests: Whether to log the requests.
  372. start_engine_loop: If True, the background task to run the engine
  373. will be automatically started in the generate call.
  374. *args: Arguments for :class:`AphroditeEngine`.
  375. **kwargs: Arguments for :class:`AphroditeEngine`.
  376. """
  377. _engine_class: Type[_AsyncAphrodite] = _AsyncAphrodite
  378. def __init__(self,
  379. *args,
  380. log_requests: bool = True,
  381. start_engine_loop: bool = True,
  382. **kwargs) -> None:
  383. self.log_requests = log_requests
  384. self.engine = self._engine_class(*args, **kwargs)
  385. # This ensures quick processing of request outputs
  386. # so the append to asyncio queues is not delayed,
  387. # especially for multi-step.
  388. self.use_process_request_outputs_callback = (
  389. self.engine.model_config.use_async_output_proc)
  390. if self.use_process_request_outputs_callback:
  391. self.engine.process_request_outputs_callback = \
  392. weak_bind(self.process_request_outputs)
  393. self.background_loop: Optional[asyncio.Future] = None
  394. # We need to keep a reference to unshielded
  395. # task as well to prevent it from being garbage
  396. # collected
  397. self._background_loop_unshielded: Optional[asyncio.Task] = None
  398. self.start_engine_loop = start_engine_loop
  399. self._errored_with: Optional[BaseException] = None
  400. # Lazy initialized fields
  401. self._request_tracker: RequestTracker
  402. def __del__(self):
  403. if rt := getattr(self, "request_tracker", None):
  404. # Wake up engine loop so that it will exit cleanly
  405. rt.new_requests_event.set()
  406. @classmethod
  407. def _get_executor_cls(
  408. cls, engine_config: EngineConfig) -> Type[ExecutorAsyncBase]:
  409. distributed_executor_backend = (
  410. engine_config.parallel_config.distributed_executor_backend)
  411. if isinstance(distributed_executor_backend, type):
  412. if not issubclass(distributed_executor_backend, ExecutorAsyncBase):
  413. raise TypeError(
  414. "distributed_executor_backend must be a subclass of "
  415. f"ExecutorAsyncBase. Got {distributed_executor_backend}.")
  416. executor_class = distributed_executor_backend
  417. elif engine_config.device_config.device_type == "neuron":
  418. from aphrodite.executor.neuron_executor import NeuronExecutorAsync
  419. executor_class = NeuronExecutorAsync
  420. elif engine_config.device_config.device_type == "tpu":
  421. if distributed_executor_backend == "ray":
  422. from aphrodite.executor.ray_tpu_executor import (
  423. RayTPUExecutorAsync)
  424. executor_class = RayTPUExecutorAsync
  425. else:
  426. assert distributed_executor_backend is None
  427. from aphrodite.executor.tpu_executor import TPUExecutorAsync
  428. executor_class = TPUExecutorAsync
  429. elif engine_config.device_config.device_type == "cpu":
  430. from aphrodite.executor.cpu_executor import CPUExecutorAsync
  431. executor_class = CPUExecutorAsync
  432. elif engine_config.device_config.device_type == "openvino":
  433. assert distributed_executor_backend is None, (
  434. "Distributed execution is not supported with "
  435. "the OpenVINO backend.")
  436. from aphrodite.executor.openvino_executor import (
  437. OpenVINOExecutorAsync)
  438. executor_class = OpenVINOExecutorAsync
  439. elif engine_config.device_config.device_type == "xpu":
  440. if distributed_executor_backend is None:
  441. from aphrodite.executor.xpu_executor import XPUExecutorAsync
  442. executor_class = XPUExecutorAsync
  443. elif distributed_executor_backend == "ray":
  444. from aphrodite.executor.ray_xpu_executor import (
  445. RayXPUExecutorAsync)
  446. executor_class = RayXPUExecutorAsync
  447. elif distributed_executor_backend == "mp":
  448. from aphrodite.executor.multiproc_xpu_executor import (
  449. MultiprocessingXPUExecutorAsync)
  450. executor_class = MultiprocessingXPUExecutorAsync
  451. else:
  452. raise RuntimeError(
  453. "Not supported distributed execution model on XPU device.")
  454. elif distributed_executor_backend == "ray":
  455. from aphrodite.executor.ray_gpu_executor import RayGPUExecutorAsync
  456. executor_class = RayGPUExecutorAsync
  457. elif distributed_executor_backend == "mp":
  458. from aphrodite.executor.multiproc_gpu_executor import (
  459. MultiprocessingGPUExecutorAsync)
  460. executor_class = MultiprocessingGPUExecutorAsync
  461. else:
  462. from aphrodite.executor.gpu_executor import GPUExecutorAsync
  463. executor_class = GPUExecutorAsync
  464. return executor_class
  465. @classmethod
  466. def from_engine_args(
  467. cls,
  468. engine_args: AsyncEngineArgs,
  469. engine_config: Optional[EngineConfig] = None,
  470. start_engine_loop: bool = True,
  471. stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
  472. ) -> "AsyncAphrodite":
  473. """Creates an async LLM engine from the engine arguments."""
  474. # Create the engine configs.
  475. if engine_config is None:
  476. engine_config = engine_args.create_engine_config()
  477. executor_class = cls._get_executor_cls(engine_config)
  478. if executor_class.uses_ray:
  479. initialize_ray_cluster(engine_config.parallel_config)
  480. # Create the async LLM engine.
  481. engine = cls(
  482. **engine_config.to_dict(),
  483. executor_class=executor_class,
  484. log_requests=not engine_args.disable_log_requests,
  485. log_stats=not engine_args.disable_log_stats,
  486. start_engine_loop=start_engine_loop,
  487. stat_loggers=stat_loggers,
  488. )
  489. return engine
  490. @property
  491. def is_running(self) -> bool:
  492. return (self.background_loop is not None
  493. and self._background_loop_unshielded is not None
  494. and not self._background_loop_unshielded.done())
  495. @property
  496. def is_stopped(self) -> bool:
  497. return self.errored or (self.background_loop is not None and
  498. self._background_loop_unshielded is not None
  499. and self._background_loop_unshielded.done())
  500. @property
  501. def errored(self) -> bool:
  502. return self._errored_with is not None
  503. @property
  504. def dead_error(self) -> BaseException:
  505. return AsyncEngineDeadError(
  506. "Background loop is not running. If it was running, "
  507. "inspect the output to find the stacktrace of the "
  508. "error that caused the background loop to stop "
  509. "(AsyncEngineDeadError).")
  510. def set_errored(self, exc: Exception) -> None:
  511. self._errored_with = exc
  512. def _error_callback(self, exc: Exception) -> None:
  513. self.set_errored(exc)
  514. self._request_tracker.propagate_exception(exc)
  515. async def get_tokenizer(
  516. self,
  517. lora_request: Optional[LoRARequest] = None,
  518. ) -> AnyTokenizer:
  519. return await (self.engine.get_tokenizer_group().
  520. get_lora_tokenizer_async(lora_request))
  521. def start_background_loop(self) -> None:
  522. """Start the background loop."""
  523. if self.errored:
  524. raise AsyncEngineDeadError(
  525. "Background loop has errored already.") from self._errored_with
  526. if self.is_running:
  527. raise RuntimeError("Background loop is already running.")
  528. # Initialize the RequestTracker here so it uses the right event loop.
  529. self._request_tracker = RequestTracker()
  530. self._background_loop_unshielded = asyncio.get_event_loop(
  531. ).create_task(self.run_engine_loop(weakref.ref(self)))
  532. self._background_loop_unshielded.add_done_callback(
  533. partial(_log_task_completion, error_callback=self._error_callback))
  534. self.background_loop = asyncio.shield(self._background_loop_unshielded)
  535. def shutdown_background_loop(self) -> None:
  536. """
  537. Shut down the background loop.
  538. This method needs to be called during cleanup to remove
  539. references to `self` and properly GC the resources held
  540. by the async LLM engine (e.g., the executors as well as
  541. their resources).
  542. """
  543. if self._background_loop_unshielded is not None:
  544. self._background_loop_unshielded.cancel()
  545. self._background_loop_unshielded = None
  546. self.background_loop = None
  547. async def engine_step(self, virtual_engine: int) -> bool:
  548. """Kick the engine to process the waiting requests.
  549. Returns True if there are in-progress requests."""
  550. new_requests, aborted_requests = (
  551. self._request_tracker.get_new_and_aborted_requests())
  552. for new_request in new_requests:
  553. # Add the request into the Aphrodite engine's waiting queue.
  554. try:
  555. await self.engine.add_request_async(**new_request)
  556. except ValueError as e:
  557. # TODO: use an Aphrodite specific error for failed validation
  558. self._request_tracker.process_exception(
  559. new_request["request_id"],
  560. e,
  561. verbose=self.log_requests,
  562. )
  563. if aborted_requests:
  564. await self._engine_abort(aborted_requests)
  565. request_outputs = await self.engine.step_async(virtual_engine)
  566. # Put the outputs into the corresponding streams.
  567. # If used as a callback, then already invoked inside
  568. # LLMEngine's _process_model_outputs
  569. if not self.use_process_request_outputs_callback:
  570. all_finished = self.process_request_outputs(request_outputs)
  571. else:
  572. # For callback case, we only need to detect when all
  573. # requests are finished
  574. all_finished = all(request_output.finished
  575. for request_output in request_outputs)
  576. return not all_finished
  577. def process_request_outputs(self, request_outputs) -> bool:
  578. # Put the outputs into the corresponding streams.
  579. all_finished = True
  580. for request_output in request_outputs:
  581. self._request_tracker.process_request_output(
  582. request_output, verbose=self.log_requests)
  583. all_finished = all_finished and request_output.finished
  584. return all_finished
  585. async def _engine_abort(self, request_ids: Iterable[str]):
  586. self.engine.abort_request(request_ids)
    @staticmethod
    async def run_engine_loop(engine_ref: ReferenceType):
        """We use a weakref to the engine so that the running loop
        doesn't prevent the engine being garbage collected.

        Keeps one `engine_step` task per virtual engine (one for each
        pipeline-parallel stage) and restarts a task whenever it completes
        with work left. When no virtual engine has work in progress, it
        waits for new requests. Returns only when the referenced engine no
        longer exists.

        Args:
            engine_ref: Weak reference to the owning `AsyncAphrodite`.

        Raises:
            asyncio.TimeoutError: If waiting for a step exceeds
                `ENGINE_ITERATION_TIMEOUT_S`; the engine is marked as errored
                first.
        """
        engine: Optional["AsyncAphrodite"] = engine_ref()
        if not engine:
            return

        pipeline_parallel_size = \
                engine.engine.parallel_config.pipeline_parallel_size
        # One in-progress flag per virtual engine.
        has_requests_in_progress = [False] * pipeline_parallel_size
        while True:
            if not any(has_requests_in_progress):
                logger.debug("Waiting for new requests...")
                # Stop the execute model loop in parallel workers until there
                # are more requests to process. This avoids waiting
                # indefinitely in torch.distributed ops which may otherwise
                # timeout, and unblocks the RPC thread in the workers so that
                # they can process any other queued control plane messages,
                # such as add/remove lora adapters.
                await engine.engine.stop_remote_worker_execution_loop_async()
                request_tracker = engine._request_tracker
                # Allow engine to be garbage collected while
                # waiting for new requests
                del engine
                await asyncio.sleep(0)
                if engine_ref() is None:
                    return
                await request_tracker.wait_for_new_requests()
                # Re-acquire a strong reference; the engine may have been
                # collected while this coroutine was waiting.
                engine = engine_ref()
                if not engine:
                    return
                logger.debug("Got new requests!")
                requests_in_progress = [
                    asyncio.create_task(engine.engine_step(ve))
                    for ve in range(pipeline_parallel_size)
                ]
                has_requests_in_progress = [True] * pipeline_parallel_size

            # Abort if iteration takes too long due to unrecoverable errors
            # (eg. NCCL timeouts).
            try:
                async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
                    done, _ = await asyncio.wait(
                        requests_in_progress,
                        return_when=asyncio.FIRST_COMPLETED)
                    # Yield to the event loop once per virtual engine.
                    # NOTE(review): presumably lets other ready step tasks
                    # make progress before results are read -- confirm.
                    for _ in range(pipeline_parallel_size):
                        await asyncio.sleep(0)
                for task in done:
                    result = task.result()
                    # The task's list position is its virtual engine index.
                    virtual_engine = requests_in_progress.index(task)
                    has_unfinished_requests = (
                        engine.engine.
                        has_unfinished_requests_for_virtual_engine(
                            virtual_engine))
                    if result or has_unfinished_requests:
                        # More work for this virtual engine: schedule the
                        # next step in the same slot.
                        requests_in_progress[virtual_engine] = (
                            asyncio.create_task(
                                engine.engine_step(virtual_engine)))
                        has_requests_in_progress[virtual_engine] = True
                    else:
                        has_requests_in_progress[virtual_engine] = False
            except asyncio.TimeoutError as exc:
                logger.error(
                    "Engine iteration timed out. This should never happen!")
                engine.set_errored(exc)
                raise
            await asyncio.sleep(0)
  # This method does not need to be async, but it is kept that way
  # for backwards compatibility.
  async def add_request(
      self,
      request_id: str,
      inputs: PromptInputs,
      params: Union[SamplingParams, PoolingParams],
      arrival_time: Optional[float] = None,
      lora_request: Optional[LoRARequest] = None,
      prompt_adapter_request: Optional[PromptAdapterRequest] = None
  ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]:
      """Register a request with the request tracker and return its stream.

      If the background loop is not running, it is started when
      ``self.start_engine_loop`` is set; otherwise the call fails.

      Args:
          request_id: The unique id of the request.
          inputs: The inputs to the LLM.
          params: Sampling or pooling parameters of the request.
          arrival_time: Arrival timestamp of the request. When ``None``,
              the current ``time.time()`` is used.
          lora_request: LoRA request to use, if any.
          prompt_adapter_request: Prompt Adapter request to use, if any.

      Returns:
          The generator obtained from the stream that the request tracker
          created for this request.

      Raises:
          AsyncEngineDeadError: If the background loop is not running and
              ``self.start_engine_loop`` is falsy.
      """
      if not self.is_running:
          if self.start_engine_loop:
              self.start_background_loop()
          else:
              raise AsyncEngineDeadError(
                  "Background loop is not running. If it was running, "
                  "inspect the output to find the stacktrace of the "
                  "error that caused the background loop to stop "
                  "(AsyncEngineDeadError).")

      # Compare against None explicitly: a truthiness test would silently
      # replace a caller-supplied timestamp of 0.0 with the current time.
      if arrival_time is None:
          arrival_time = time.time()

      stream = self._request_tracker.add_request(
          request_id,
          verbose=self.log_requests,
          inputs=inputs,
          params=params,
          arrival_time=arrival_time,
          lora_request=lora_request,
          prompt_adapter_request=prompt_adapter_request)
      return stream.generator()
  async def generate(
      self,
      inputs: PromptInputs,
      sampling_params: SamplingParams,
      request_id: str,
      lora_request: Optional[LoRARequest] = None,
      prompt_adapter_request: Optional[PromptAdapterRequest] = None
  ) -> AsyncGenerator[RequestOutput, None]:
      """Generate outputs for a request.

      This method is an async generator. It adds the request into the
      waiting queue of the AphroditeEngine and streams the outputs from
      the AphroditeEngine to the caller.

      Args:
          inputs: The inputs to the LLM. See
              :class:`~aphrodite.inputs.PromptInputs`
              for more details about the format of each input.
          sampling_params: The sampling parameters of the request.
          request_id: The unique id of the request.
          lora_request: LoRA request to use for generation, if any.
          prompt_adapter_request: Prompt Adapter request to use
              for generation, if any.

      Yields:
          The output `RequestOutput` objects from the AphroditeEngine
          for the request.

      Details:
          - If the engine is not running, start the background loop,
            which iteratively invokes
            :meth:`~aphrodite.engine.async_aphrodite.AsyncAphrodite.engine_step`
            to process the waiting requests.
          - Add the request to the engine's `RequestTracker`.
            On the next background loop, this request will be sent to
            the underlying engine.
            Also, a corresponding `AsyncStream` will be created.
          - Wait for the request outputs from `AsyncStream` and yield them.

      Example:
          >>> # Please refer to entrypoints/api_server.py for
          >>> # the complete example.
          >>>
          >>> # initialize the engine and the example input
          >>> engine = AsyncAphrodite.from_engine_args(engine_args)
          >>> example_input = {
          >>>     "prompt": "What is LLM?",
          >>>     "stream": False, # assume the non-streaming case
          >>>     "temperature": 0.0,
          >>>     "request_id": 0,
          >>> }
          >>>
          >>> # start the generation
          >>> results_generator = engine.generate(
          >>>     example_input["prompt"],
          >>>     SamplingParams(temperature=example_input["temperature"]),
          >>>     example_input["request_id"])
          >>>
          >>> # get the results
          >>> final_output = None
          >>> async for request_output in results_generator:
          >>>     if await request.is_disconnected():
          >>>         # Abort the request if the client disconnects.
          >>>         await engine.abort(request_id)
          >>>         # Return or raise an error
          >>>         ...
          >>>     final_output = request_output
          >>>
          >>> # Process and return the final output
          >>> ...
      """
      # Register the request first; the awaited call hands back the
      # per-request output stream that is drained below.
      output_stream = await self.add_request(
          request_id,
          inputs,
          sampling_params,
          lora_request=lora_request,
          prompt_adapter_request=prompt_adapter_request,
      )
      async for item in output_stream:
          yield AphroditeEngine.validate_output(item, RequestOutput)
  async def encode(
      self,
      inputs: PromptInputs,
      pooling_params: PoolingParams,
      request_id: str,
      lora_request: Optional[LoRARequest] = None,
  ) -> AsyncGenerator[EmbeddingRequestOutput, None]:
      """Generate outputs for a request from an embedding model.

      This method is an async generator. It adds the request into the
      waiting queue of the AphroditeEngine and streams the outputs from
      the AphroditeEngine to the caller.

      Args:
          inputs: The inputs to the LLM. See
              :class:`~aphrodite.inputs.PromptInputs`
              for more details about the format of each input.
          pooling_params: The pooling parameters of the request.
          request_id: The unique id of the request.
          lora_request: LoRA request to use for generation, if any.

      Yields:
          The output `EmbeddingRequestOutput` objects from the
          AphroditeEngine for the request.

      Details:
          - If the engine is not running, start the background loop,
            which iteratively invokes
            :meth:`~aphrodite.engine.async_aphrodite.AsyncAphrodite.engine_step`
            to process the waiting requests.
          - Add the request to the engine's `RequestTracker`.
            On the next background loop, this request will be sent to
            the underlying engine.
            Also, a corresponding `AsyncStream` will be created.
          - Wait for the request outputs from `AsyncStream` and yield them.

      Example:
          >>> # Please refer to endpoints/api_server.py for
          >>> # the complete example.
          >>>
          >>> # initialize the engine and the example input
          >>> engine = AsyncAphrodite.from_engine_args(engine_args)
          >>> example_input = {
          >>>     "input": "What is LLM?",
          >>>     "request_id": 0,
          >>> }
          >>>
          >>> # start the generation
          >>> results_generator = engine.encode(
          >>>     example_input["input"],
          >>>     PoolingParams(),
          >>>     example_input["request_id"])
          >>>
          >>> # get the results
          >>> final_output = None
          >>> async for request_output in results_generator:
          >>>     if await request.is_disconnected():
          >>>         # Abort the request if the client disconnects.
          >>>         await engine.abort(request_id)
          >>>         # Return or raise an error
          >>>         ...
          >>>     final_output = request_output
          >>>
          >>> # Process and return the final output
          >>> ...
      """
      # Register the request first; the awaited call hands back the
      # per-request output stream that is drained below.
      output_stream = await self.add_request(
          request_id,
          inputs,
          pooling_params,
          lora_request=lora_request,
      )
      async for item in output_stream:
          yield AphroditeEngine.validate_output(item,
                                                EmbeddingRequestOutput)
  async def abort(self, request_id: str) -> None:
      """Abort a submitted request.

      If the request is finished or not found, this method will be a
      no-op.

      Args:
          request_id: The unique id of the request.

      Raises:
          AsyncEngineDeadError: If the background loop is not running.
      """
      # Happy path first: hand the work to the synchronous helper.
      if self.is_running:
          return self._abort(request_id)

      raise AsyncEngineDeadError(
          "Background loop is not running. If it was running, "
          "inspect the output to find the stacktrace of the "
          "error that caused the background loop to stop "
          "(AsyncEngineDeadError).")
  def _abort(self, request_id: str) -> None:
      """Ask the request tracker to abort a submitted request.

      If the request is finished or not found, this method will be a
      no-op.

      Args:
          request_id: The unique id of the request.
      """
      tracker = self._request_tracker
      # The exception class (not an instance) is handed to the tracker.
      tracker.abort_request(
          request_id,
          exception=asyncio.CancelledError,
          verbose=self.log_requests,
      )
  async def get_model_config(self) -> ModelConfig:
      """Return the model configuration reported by the wrapped engine."""
      wrapped_engine = self.engine
      return wrapped_engine.get_model_config()
  async def get_parallel_config(self) -> ParallelConfig:
      """Return the parallel configuration reported by the wrapped engine."""
      wrapped_engine = self.engine
      return wrapped_engine.get_parallel_config()
  async def get_decoding_config(self) -> DecodingConfig:
      """Return the decoding configuration reported by the wrapped engine."""
      wrapped_engine = self.engine
      return wrapped_engine.get_decoding_config()
  async def get_scheduler_config(self) -> SchedulerConfig:
      """Return the scheduling configuration reported by the wrapped engine."""
      wrapped_engine = self.engine
      return wrapped_engine.get_scheduler_config()
  async def get_lora_config(self) -> LoRAConfig:
      """Return the LoRA configuration reported by the wrapped engine."""
      wrapped_engine = self.engine
      return wrapped_engine.get_lora_config()
  async def do_log_stats(
      self,
      scheduler_outputs: Optional[SchedulerOutputs] = None,
      model_output: Optional[List[SamplerOutput]] = None
  ) -> None:
      """Trigger stats logging on the wrapped engine.

      Args:
          scheduler_outputs: Accepted but not forwarded to the engine.
          model_output: Accepted but not forwarded to the engine.
      """
      # NOTE(review): both arguments are ignored here; confirm whether the
      # engine's do_log_stats is meant to receive them.
      wrapped_engine = self.engine
      wrapped_engine.do_log_stats()
  async def check_health(self) -> None:
      """Check engine health, logging how long the check took.

      Raises:
          AsyncEngineDeadError: If the background loop is stopped.
      """
      # Start timestamp for the duration reported in the final debug line.
      t = time.perf_counter()
      logger.debug("Starting health check...")

      # A stopped background loop is reported before the engine is queried.
      if self.is_stopped:
          raise AsyncEngineDeadError("Background loop is stopped.")

      wrapped_engine = self.engine
      await wrapped_engine.check_health_async()
      logger.debug(f"Health check took {time.perf_counter() - t}s")
  def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
      """Register ``logger`` on the wrapped engine under ``logger_name``."""
      wrapped_engine = self.engine
      wrapped_engine.add_logger(logger_name=logger_name, logger=logger)
  def remove_logger(self, logger_name: str) -> None:
      """Remove the logger registered as ``logger_name`` from the wrapped engine."""
      wrapped_engine = self.engine
      wrapped_engine.remove_logger(logger_name=logger_name)