
fix: better async request cancellation (#641)

* wip

* wip

* rpc client async generator

* serving chat

* serving completions

* serving embeddings
AlpinDale 6 months ago
parent
commit
77c4fbd5c9

+ 61 - 50
aphrodite/common/utils.py

@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import contextlib
 import datetime
 import enum
 import gc
@@ -11,10 +12,11 @@ import tempfile
 import threading
 import uuid
 import warnings
+from asyncio import FIRST_COMPLETED, ensure_future
 from collections import defaultdict
 from functools import lru_cache, partial, wraps
 from platform import uname
-from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
+from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic,
                     Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar,
                     Union, overload)
 
@@ -372,63 +374,72 @@ def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]:
     return _async_wrapper
 
 
-class ProducerFinished:
-    pass
+async def iterate_with_cancellation(
+    iterator: AsyncGenerator[T, None],
+    is_cancelled: Callable[[], Awaitable[bool]],
+) -> AsyncGenerator[T, None]:
+    """Convert async iterator into one that polls the provided function
+    at least once per second to check for client cancellation.
+    """
 
+    # Can use anext() in python >= 3.10
+    awaits = [ensure_future(iterator.__anext__())]
+    while True:
+        done, pending = await asyncio.wait(awaits, timeout=1)
+        if await is_cancelled():
+            with contextlib.suppress(BaseException):
+                awaits[0].cancel()
+                await iterator.aclose()
+            raise asyncio.CancelledError("client cancelled")
+        if done:
+            try:
+                item = await awaits[0]
+                awaits[0] = ensure_future(iterator.__anext__())
+                yield item
+            except StopAsyncIteration:
+                # we are done
+                return
 
-def merge_async_iterators(
-        *iterators: AsyncIterator[T]) -> AsyncIterator[Tuple[int, T]]:
-    """Merge multiple asynchronous iterators into a single iterator.
 
+async def merge_async_iterators(
+    *iterators: AsyncGenerator[T, None],
+    is_cancelled: Callable[[], Awaitable[bool]],
+) -> AsyncGenerator[Tuple[int, T], None]:
+    """Merge multiple asynchronous iterators into a single iterator.
     This method handles the case where some iterators finish before others.
     When it yields, it yields a tuple (i, item) where i is the index of the
     iterator that yielded the item.
+    It also polls the provided function at least once per second to check
+    for client cancellation.
     """
-    queue: asyncio.Queue[Union[Tuple[int, T], ProducerFinished,
-                               Exception]] = asyncio.Queue()
-
-    producers = len(iterators)
-
-    async def producer(i: int, iterator: AsyncIterator[T]):
-        try:
-            async for item in iterator:
-                await queue.put((i, item))
-        except Exception as e:
-            await queue.put(e)
-        # Signal to the consumer that we've finished
-        await queue.put(ProducerFinished())
-
-    _tasks = [
-        asyncio.create_task(producer(i, iterator))
-        for i, iterator in enumerate(iterators)
-    ]
 
-    async def consumer():
-        remaining = producers
-        try:
-            while remaining or not queue.empty():
-                # we think there is a race condition here
-                item = await queue.get()
-
-                if isinstance(item, ProducerFinished):
-                    # Signal that a producer finished- not a real item
-                    remaining -= 1
-                    continue
-
-                if isinstance(item, Exception):
-                    raise item
-                yield item
-        except (Exception, asyncio.CancelledError) as e:
-            for task in _tasks:
-                if sys.version_info >= (3, 9):
-                    # msg parameter only supported in Python 3.9+
-                    task.cancel(e)
-                else:
-                    task.cancel()
-            raise e
-        await asyncio.gather(*_tasks)
-
-    return consumer()
+    # Can use anext() in python >= 3.10
+    awaits = {
+        ensure_future(pair[1].__anext__()): pair
+        for pair in enumerate(iterators)
+    }
+    try:
+        while awaits:
+            done, pending = await asyncio.wait(awaits.keys(),
+                                               return_when=FIRST_COMPLETED,
+                                               timeout=1)
+            if await is_cancelled():
+                raise asyncio.CancelledError("client cancelled")
+            for d in done:
+                pair = awaits.pop(d)
+                try:
+                    item = await d
+                    i, it = pair
+                    awaits[ensure_future(it.__anext__())] = pair
+                    yield i, item
+                except StopAsyncIteration:
+                    pass
+    finally:
+        # Cancel any remaining iterators
+        for f, (_, it) in awaits.items():
+            with contextlib.suppress(BaseException):
+                f.cancel()
+                await it.aclose()
 
 
 def get_ip() -> str:
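
A minimal usage sketch of the new merge_async_iterators signature, assuming it is imported from aphrodite.common.utils as above; the toy producer and the is_cancelled stub are illustrative, standing in for the engine result streams and raw_request.is_disconnected:

import asyncio
from typing import AsyncGenerator

from aphrodite.common.utils import merge_async_iterators

async def producer(prefix: str, n: int) -> AsyncGenerator[str, None]:
    # Illustrative stand-in for a per-prompt engine result stream.
    for i in range(n):
        await asyncio.sleep(0.01)
        yield f"{prefix}{i}"

async def main() -> None:
    async def is_cancelled() -> bool:
        # The server passes raw_request.is_disconnected here.
        return False

    merged = merge_async_iterators(producer("a", 3), producer("b", 2),
                                   is_cancelled=is_cancelled)
    async for idx, item in merged:
        # idx is the position of the iterator that produced the item.
        print(idx, item)

asyncio.run(main())

If is_cancelled ever returns True, the loop raises asyncio.CancelledError and the remaining iterators are closed in the finally block.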

+ 31 - 29
aphrodite/endpoints/openai/rpc/client.py

@@ -1,5 +1,5 @@
 from contextlib import contextmanager
-from typing import Any, AsyncIterator, Optional
+from typing import Any, AsyncGenerator, Optional
 
 import cloudpickle
 import zmq
@@ -179,35 +179,37 @@ class AsyncEngineRPCClient:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None
-    ) -> AsyncIterator[RequestOutput]:
+    ) -> AsyncGenerator[RequestOutput, None]:
         """Send an RPCGenerateRequest to the RPCServer and stream responses."""
 
-        with self.socket() as socket:
-
-            # Send RPCGenerateRequest to the RPCServer.
-            await socket.send_multipart([
-                cloudpickle.dumps(
-                    RPCGenerateRequest(
-                        inputs=inputs,
-                        sampling_params=sampling_params,
-                        request_id=request_id,
-                        lora_request=lora_request,
-                        prompt_adapter_request=prompt_adapter_request))
-            ])
-
-            # Stream back the results from the RPC Server.
-            while True:
-                message = await socket.recv()
-                request_output = cloudpickle.loads(message)
-
-                if isinstance(request_output, Exception):
-                    raise request_output
-
-                if request_output.finished:
-                    break
-                yield request_output
-
-            yield request_output
+        finished = False
+        try:
+            with self.socket() as socket:
+
+                # Send RPCGenerateRequest to the RPCServer.
+                await socket.send_multipart([
+                    cloudpickle.dumps(
+                        RPCGenerateRequest(
+                            inputs=inputs,
+                            sampling_params=sampling_params,
+                            request_id=request_id,
+                            lora_request=lora_request,
+                            prompt_adapter_request=prompt_adapter_request))
+                ])
+
+                # Stream back the results from the RPC Server.
+                while not finished:
+                    message = await socket.recv()
+                    request_output = cloudpickle.loads(message)
+
+                    if isinstance(request_output, Exception):
+                        raise request_output
+
+                    finished = request_output.finished
+                    yield request_output
+        finally:
+            if not finished:
+                await self.abort(request_id)
 
     async def check_health(self) -> None:
         """Raise if unhealthy"""
@@ -231,6 +233,6 @@ class AsyncEngineRPCClient:
                              f"{health_message}")
 
     async def encode(self, *args,
-                     **kwargs) -> AsyncIterator[EmbeddingRequestOutput]:
+                     **kwargs) -> AsyncGenerator[EmbeddingRequestOutput, None]:
         raise NotImplementedError(
             "Embeddings not supported with multiprocessing backend")

+ 18 - 16
aphrodite/endpoints/openai/serving_chat.py

@@ -1,3 +1,4 @@
+import asyncio
 import time
 from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
 from typing import Sequence as GenericSequence
@@ -10,7 +11,7 @@ from transformers import PreTrainedTokenizer
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sequence import Logprob
-from aphrodite.common.utils import random_uuid
+from aphrodite.common.utils import iterate_with_cancellation, random_uuid
 from aphrodite.endpoints.chat_utils import (ConversationMessage,
                                             load_chat_template,
                                             parse_chat_messages)
@@ -160,18 +161,20 @@ class OpenAIServingChat(OpenAIServing):
             # TODO: Use an aphrodite-specific Validation Error
             return self.create_error_response(str(e))
 
+        if raw_request:
+            result_generator = iterate_with_cancellation(
+                result_generator, raw_request.is_disconnected)
+
         # Streaming response
         if request.stream:
             return self.chat_completion_stream_generator(
                 request, result_generator, request_id, conversation, tokenizer)
-        else:
-            try:
-                return await self.chat_completion_full_generator(
-                    request, raw_request, result_generator, request_id,
-                    conversation, tokenizer)
-            except ValueError as e:
-                # TODO: Use an aphrodite-specific Validation Error
-                return self.create_error_response(str(e))
+        try:
+            return await self.chat_completion_full_generator(
+                request, result_generator, request_id, conversation, tokenizer)
+        except ValueError as e:
+            # TODO: Use an aphrodite-specific Validation Error
+            return self.create_error_response(str(e))
 
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
@@ -402,7 +405,6 @@ class OpenAIServingChat(OpenAIServing):
     async def chat_completion_full_generator(
         self,
         request: ChatCompletionRequest,
-        raw_request: Optional[Request],
         result_generator: AsyncIterator[RequestOutput],
         request_id: str,
         conversation: List[ConversationMessage],
@@ -413,12 +415,12 @@ class OpenAIServingChat(OpenAIServing):
         created_time = int(time.time())
         final_res: Optional[RequestOutput] = None
 
-        async for res in result_generator:
-            if raw_request is not None and await raw_request.is_disconnected():
-                # Abort the request if the client disconnects.
-                await self.async_engine_client.abort(request_id)
-                return self.create_error_response("Client disconnected")
-            final_res = res
+        try:
+            async for res in result_generator:
+                final_res = res
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+
         assert final_res is not None
 
         choices: List[ChatCompletionResponseChoice] = []
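
The endpoint-level flow in isolation: iterate_with_cancellation raises asyncio.CancelledError once its poll reports a disconnect, and the handler above turns that into an error response. A toy sketch; fake_stream and the is_disconnected stub are illustrative:

import asyncio

from aphrodite.common.utils import iterate_with_cancellation

async def fake_stream():
    for i in range(3):
        await asyncio.sleep(0.1)
        yield i

async def main() -> None:
    async def is_disconnected() -> bool:
        return False  # raw_request.is_disconnected in the server

    gen = iterate_with_cancellation(fake_stream(), is_disconnected)
    try:
        async for item in gen:
            print("got", item)
    except asyncio.CancelledError:
        print("client disconnected")  # create_error_response(...) in the server

asyncio.run(main())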

+ 6 - 14
aphrodite/endpoints/openai/serving_completions.py

@@ -1,3 +1,4 @@
+import asyncio
 import time
 from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
                     Optional)
@@ -76,7 +77,7 @@ class OpenAIServingCompletion(OpenAIServing):
         created_time = int(time.time())
 
         # Schedule the request and get the result generator.
-        generators: List[AsyncIterator[RequestOutput]] = []
+        generators: List[AsyncGenerator[RequestOutput, None]] = []
         try:
             (
                 lora_request,
@@ -126,7 +127,8 @@ class OpenAIServingCompletion(OpenAIServing):
             return self.create_error_response(str(e))
 
         result_generator: AsyncIterator[Tuple[
-            int, RequestOutput]] = merge_async_iterators(*generators)
+            int, RequestOutput]] = merge_async_iterators(
+                *generators, is_cancelled=raw_request.is_disconnected)
 
         # Similar to the OpenAI API, when n != best_of, we do not stream the
         # results. In addition, we do not stream the results when use
@@ -138,7 +140,6 @@ class OpenAIServingCompletion(OpenAIServing):
         # Streaming response
         if stream:
             return self.completion_stream_generator(request,
-                                                    raw_request,
                                                     result_generator,
                                                     request_id,
                                                     created_time,
@@ -150,10 +151,6 @@ class OpenAIServingCompletion(OpenAIServing):
         final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts)
         try:
             async for i, res in result_generator:
-                if await raw_request.is_disconnected():
-                    # Abort the request if the client disconnects.
-                    await self.async_engine_client.abort(f"{request_id}-{i}")
-                    return self.create_error_response("Client disconnected")
                 final_res_batch[i] = res
 
             for i, final_res in enumerate(final_res_batch):
@@ -175,6 +172,8 @@ class OpenAIServingCompletion(OpenAIServing):
                 model_name,
                 tokenizer,
             )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use an aphrodite-specific Validation Error
             return self.create_error_response(str(e))
@@ -195,7 +194,6 @@ class OpenAIServingCompletion(OpenAIServing):
     async def completion_stream_generator(
         self,
         request: CompletionRequest,
-        raw_request: Request,
         result_generator: AsyncIterator[Tuple[int, RequestOutput]],
         request_id: str,
         created_time: int,
@@ -211,12 +209,6 @@ class OpenAIServingCompletion(OpenAIServing):
         try:
             async for prompt_idx, res in result_generator:
 
-                # Abort the request if the client disconnects.
-                if await raw_request.is_disconnected():
-                    await self.async_engine_client.abort(
-                        f"{request_id}-{prompt_idx}")
-                    raise StopAsyncIteration()
-
                 for output in res.outputs:
                     i = output.index + prompt_idx * num_choices
                     # TODO: optimize the performance by avoiding full
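
For reference, the flat choice index in the context above interleaves per-prompt choices: choice c of prompt p lands at slot c + p * num_choices. With two prompts and num_choices = 2:

# prompt 0 -> slots 0, 1; prompt 1 -> slots 2, 3
assert [c + p * 2 for p in range(2) for c in range(2)] == [0, 1, 2, 3]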

+ 7 - 7
aphrodite/endpoints/openai/serving_embedding.py

@@ -1,6 +1,7 @@
+import asyncio
 import base64
 import time
-from typing import AsyncIterator, List, Optional, Tuple, cast
+from typing import AsyncGenerator, AsyncIterator, List, Optional, Tuple, cast
 
 import numpy as np
 from fastapi import Request
@@ -91,7 +92,7 @@ class OpenAIServingEmbedding(OpenAIServing):
         created_time = int(time.monotonic())
 
         # Schedule the request and get the result generator.
-        generators: List[AsyncIterator[EmbeddingRequestOutput]] = []
+        generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = []
         try:
             (
                 lora_request,
@@ -136,17 +137,14 @@ class OpenAIServingEmbedding(OpenAIServing):
             return self.create_error_response(str(e))
 
         result_generator: AsyncIterator[Tuple[
-            int, EmbeddingRequestOutput]] = merge_async_iterators(*generators)
+            int, EmbeddingRequestOutput]] = merge_async_iterators(
+                *generators, is_cancelled=raw_request.is_disconnected)
 
         # Non-streaming response
         final_res_batch: List[Optional[EmbeddingRequestOutput]]
         final_res_batch = [None] * len(prompts)
         try:
             async for i, res in result_generator:
-                if await raw_request.is_disconnected():
-                    # Abort the request if the client disconnects.
-                    await self.async_engine_client.abort(f"{request_id}-{i}")
-                    return self.create_error_response("Client disconnected")
                 final_res_batch[i] = res
 
             for final_res in final_res_batch:
@@ -157,6 +155,8 @@ class OpenAIServingEmbedding(OpenAIServing):
             response = request_output_to_embedding_response(
                 final_res_batch_checked, request_id, created_time, model_name,
                 encoding_format)
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use an aphrodite-specific Validation Error
             return self.create_error_response(str(e))

+ 67 - 76
aphrodite/engine/async_aphrodite.py

@@ -2,7 +2,7 @@ import asyncio
 import os
 import time
 from functools import partial
-from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
+from typing import (AsyncGenerator, Callable, Dict, Iterable, List, Optional,
                     Set, Tuple, Type, Union)
 
 from loguru import logger
@@ -61,12 +61,16 @@ def _log_task_completion(task: asyncio.Task,
             "actual cause.") from e
 
 
+STOP_ITERATION = Exception()  # Sentinel
+
+
 class AsyncStream:
     """A stream of RequestOutputs or EmbeddingRequestOutputs for a request
-    that can be iterated over asynchronously."""
+    that can be iterated over asynchronously via an async generator."""
 
-    def __init__(self, request_id: str) -> None:
+    def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
         self.request_id = request_id
+        self._cancel = cancel
         self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False
 
@@ -76,22 +80,30 @@ class AsyncStream:
             return
         self._queue.put_nowait(item)
 
-    def finish(self) -> None:
-        self._queue.put_nowait(StopAsyncIteration())
-        self._finished = True
+    def finish(self, cancelled: bool = False) -> None:
+        if not self._finished:
+            self._finished = True
+            self._queue.put_nowait(
+                asyncio.CancelledError if cancelled else STOP_ITERATION)
 
     @property
     def finished(self) -> bool:
         return self._finished
 
-    def __aiter__(self):
-        return self
-
-    async def __anext__(self) -> Union[RequestOutput, EmbeddingRequestOutput]:
-        result = await self._queue.get()
-        if isinstance(result, Exception):
-            raise result
-        return result
+    async def generator(
+        self
+    ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]:
+        try:
+            while not self._finished:
+                result = await self._queue.get()
+                if isinstance(result, Exception):
+                    if result == STOP_ITERATION:
+                        return
+                    raise result
+                yield result
+        except GeneratorExit:
+            self._cancel(self.request_id)
+            raise asyncio.CancelledError from None
 
 
 class RequestTracker:
@@ -99,7 +111,7 @@ class RequestTracker:
 
     def __init__(self) -> None:
         self._request_streams: Dict[str, AsyncStream] = {}
-        self._finished_requests: asyncio.Queue[str] = asyncio.Queue()
+        self._aborted_requests: asyncio.Queue[str] = asyncio.Queue()
         self._new_requests: asyncio.Queue[Tuple[AsyncStream,
                                                 dict]] = asyncio.Queue()
         self.new_requests_event = asyncio.Event()
@@ -130,15 +142,21 @@ class RequestTracker:
                                verbose: bool = False) -> None:
         """Process a request output from the engine."""
         request_id = request_output.request_id
+        finished = request_output.finished
 
+        if finished:
+            stream = self._request_streams.pop(request_id, None)
+        else:
+            stream = self._request_streams.get(request_id)
         # Guard against a KeyError which can occur if the request was aborted
         # while the output was generated
-        if (stream := self._request_streams.get(request_id)) is not None:
+        if stream is not None:
             stream.put(request_output)
-        if request_output.finished:
-            if verbose:
-                logger.info(f"Finished request {request_id}.")
-            self.abort_request(request_id)
+            if finished:
+                stream.finish()
+
+        if verbose and finished:
+            logger.info(f"Finished request {request_id}.")
 
     def process_exception(self,
                           request_id: str,
@@ -161,7 +179,8 @@ class RequestTracker:
         if request_id in self._request_streams:
             raise KeyError(f"Request {request_id} already exists.")
 
-        stream = AsyncStream(request_id)
+        abort_request = partial(self.abort_request, verbose=verbose)
+        stream = AsyncStream(request_id, abort_request)
         self._new_requests.put_nowait((stream, {
             "request_id": request_id,
             **engine_add_request_kwargs
@@ -174,36 +193,36 @@ class RequestTracker:
 
         return stream
 
-    def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
+    def abort_request(self,
+                      request_id: str,
+                      *,
+                      cancelled: bool = False,
+                      verbose: bool = False) -> None:
         """Abort a request during next background loop iteration."""
         if verbose:
             logger.info(f"Aborted request {request_id}.")
 
-        self._finished_requests.put_nowait(request_id)
-
-        if request_id not in self._request_streams or self._request_streams[
-                request_id].finished:
-            # The request has already finished or been aborted.
-            return
+        self._aborted_requests.put_nowait(request_id)
 
-        self._request_streams[request_id].finish()
+        stream = self._request_streams.pop(request_id, None)
+        if stream is not None:
+            stream.finish(cancelled=cancelled)
 
-    def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]:
+    def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]:
         """Get the new requests and finished requests to be
         sent to the engine."""
         new_requests: List[Dict] = []
         finished_requests: Set[str] = set()
 
-        while not self._finished_requests.empty():
-            request_id = self._finished_requests.get_nowait()
+        while not self._aborted_requests.empty():
+            request_id = self._aborted_requests.get_nowait()
             finished_requests.add(request_id)
-            self._request_streams.pop(request_id, None)
 
         while not self._new_requests.empty():
             stream, new_request = self._new_requests.get_nowait()
             if stream.request_id in finished_requests:
                 # The request has already been aborted.
-                stream.finish()
+                stream.finish(cancelled=True)
                 continue
             self._request_streams[stream.request_id] = stream
             new_requests.append(new_request)
@@ -554,8 +573,8 @@ class AsyncAphrodite:
 
         Returns True if there are in-progress requests."""
 
-        new_requests, finished_requests = (
-            self._request_tracker.get_new_and_finished_requests())
+        new_requests, aborted_requests = (
+            self._request_tracker.get_new_and_aborted_requests())
 
         for new_request in new_requests:
             # Add the request into the Aphrodite engine's waiting queue.
@@ -574,8 +593,8 @@ class AsyncAphrodite:
                     verbose=self.log_requests,
                 )
 
-        if finished_requests:
-            await self._engine_abort(finished_requests)
+        if aborted_requests:
+            await self._engine_abort(aborted_requests)
 
         if self.engine_use_ray:
             request_outputs = await self.engine.step.remote()  # type: ignore
@@ -664,6 +683,8 @@ class AsyncAphrodite:
                 raise
             await asyncio.sleep(0)
 
+    # This method does not need to be async, but kept that way
+    # for backwards compatibility.
     async def add_request(
         self,
         request_id: str,
@@ -672,7 +693,7 @@ class AsyncAphrodite:
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> AsyncStream:
+    ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]:
 
         if not self.is_running:
             if self.start_engine_loop:
@@ -684,19 +705,16 @@ class AsyncAphrodite:
                     "error that caused the background loop to stop "
                     "(AsyncEngineDeadError).")
 
-        if arrival_time is None:
-            arrival_time = time.time()
-
         stream = self._request_tracker.add_request(
             request_id,
             verbose=self.log_requests,
             inputs=inputs,
             params=params,
-            arrival_time=arrival_time,
+            arrival_time=arrival_time or time.time(),
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request)
 
-        return stream
+        return stream.generator()
 
     async def generate(
         self,
@@ -705,7 +723,7 @@ class AsyncAphrodite:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> AsyncIterator[RequestOutput]:
+    ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
         Generate outputs for a request. This method is a coroutine. It adds the
@@ -771,7 +789,7 @@ class AsyncAphrodite:
             >>> # Process and return the final output
             >>> ...
         """
-        async for output in self._process_request(
+        async for output in await self.add_request(
                 request_id,
                 inputs,
                 sampling_params,
@@ -786,7 +804,7 @@ class AsyncAphrodite:
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
-    ) -> AsyncIterator[EmbeddingRequestOutput]:
+    ) -> AsyncGenerator[EmbeddingRequestOutput, None]:
         """Generate outputs for a request from an embedding model.
         Generate outputs for a request. This method is a coroutine. It adds the
         request into the waiting queue of the AphroditeEngine and streams the
@@ -840,7 +858,7 @@ class AsyncAphrodite:
             >>> # Process and return the final output
             >>> ...
         """
-        async for output in self._process_request(
+        async for output in await self.add_request(
                 request_id,
                 inputs,
                 pooling_params,
@@ -849,34 +867,6 @@ class AsyncAphrodite:
             yield AphroditeEngine.validate_output(output,
                                                   EmbeddingRequestOutput)
 
-    async def _process_request(
-        self,
-        request_id: str,
-        inputs: PromptInputs,
-        params: Union[SamplingParams, PoolingParams],
-        *,
-        lora_request: Optional[LoRARequest] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]:
-        """Common logic to process requests with SamplingParams or
-        PoolingParams."""
-        arrival_time = time.time()
-
-        stream = await self.add_request(
-            request_id,
-            inputs,
-            params,
-            arrival_time=arrival_time,
-            lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request)
-
-        try:
-            async for request_output in stream:
-                yield request_output
-        except (Exception, asyncio.CancelledError) as e:
-            self._abort(request_id)
-            raise e
-
     async def abort(self, request_id: str) -> None:
         """Abort a request.
 
@@ -905,6 +895,7 @@ class AsyncAphrodite:
             request_id: The unique id of the request.
         """
         self._request_tracker.abort_request(request_id,
+                                            cancelled=True,
                                             verbose=self.log_requests)
 
     async def get_model_config(self) -> ModelConfig:
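
The reworked AsyncStream turns consumer-side abandonment into an engine abort: closing the generator raises GeneratorExit inside it, and the except block invokes the cancel callback. A self-contained sketch of that mechanism; ToyStream is illustrative, not the real class:

import asyncio
from typing import AsyncGenerator, Callable

class ToyStream:
    def __init__(self, request_id: str,
                 cancel: Callable[[str], None]) -> None:
        self.request_id = request_id
        self._cancel = cancel
        self._queue: asyncio.Queue = asyncio.Queue()

    async def generator(self) -> AsyncGenerator[int, None]:
        try:
            while True:
                yield await self._queue.get()
        except GeneratorExit:
            # The consumer closed (or dropped) the generator:
            # ask the engine to abort this request.
            self._cancel(self.request_id)
            raise

async def main() -> None:
    aborted = []
    stream = ToyStream("req-1", aborted.append)
    stream._queue.put_nowait(42)

    gen = stream.generator()
    print(await gen.__anext__())  # 42
    await gen.aclose()            # GeneratorExit -> cancel callback
    assert aborted == ["req-1"]

asyncio.run(main())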

+ 5 - 5
aphrodite/engine/protocol.py

@@ -1,4 +1,4 @@
-from typing import AsyncIterator, List, Optional, Protocol, runtime_checkable
+from typing import AsyncGenerator, List, Optional, Protocol, runtime_checkable
 
 from transformers import PreTrainedTokenizer
 
@@ -29,24 +29,24 @@ class AsyncEngineClient(Protocol):
     def errored(self) -> bool:
         ...
 
-    async def generate(
+    def generate(
         self,
         inputs: PromptInputs,
         sampling_params: SamplingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None
-    ) -> AsyncIterator[RequestOutput]:
+    ) -> AsyncGenerator[RequestOutput, None]:
         """Generates outputs for a request"""
         ...
 
-    async def encode(
+    def encode(
         self,
         inputs: PromptInputs,
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
-    ) -> AsyncIterator[EmbeddingRequestOutput]:
+    ) -> AsyncGenerator[EmbeddingRequestOutput, None]:
         """Generate outputs for a request from an embedding model."""
         ...
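
Dropping async on these protocol methods matches how async generator functions are invoked: the call itself is synchronous and returns the generator, so callers write async for directly without awaiting the call first. A minimal sketch of why the plain-def form is the right type; Streamer and ToyStreamer are illustrative names:

import asyncio
from typing import AsyncGenerator, Protocol

class Streamer(Protocol):
    def generate(self, prompt: str) -> AsyncGenerator[str, None]:
        ...

class ToyStreamer:
    async def generate(self, prompt: str) -> AsyncGenerator[str, None]:
        # An async generator function: calling it returns the generator
        # synchronously, which satisfies the protocol's plain def.
        yield prompt.upper()

async def consume(client: Streamer) -> None:
    async for chunk in client.generate("hi"):  # note: no await on the call
        print(chunk)

asyncio.run(consume(ToyStreamer()))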