@@ -7,6 +7,7 @@ from typing import Tuple, Type, TypeVar, Union

from loguru import logger
from transformers import PreTrainedTokenizer
+from typing_extensions import assert_never

from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                     EngineConfig, LoadConfig, LoRAConfig,
@@ -22,8 +23,7 @@ from aphrodite.common.sequence import (EmbeddingSequenceGroupOutput,
                                       ExecuteModelRequest, PoolerOutput,
                                       SamplerOutput, Sequence, SequenceGroup,
                                       SequenceGroupMetadata, SequenceStatus)
-from aphrodite.common.utils import (Counter, is_embedding_model_config,
-                                    is_encoder_decoder_model_config)
+from aphrodite.common.utils import Counter
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.engine.metrics import (LoggingStatLogger, PrometheusStatLogger,
                                      StatLoggerBase, Stats)
@@ -34,9 +34,11 @@ from aphrodite.engine.output_processor.util import (
    create_output_by_sequence_group)
from aphrodite.executor.executor_base import ExecutorBase
from aphrodite.executor.ray_utils import initialize_ray_cluster
-from aphrodite.inputs import (INPUT_REGISTRY, LLMInputs, PromptInputs,
-                              get_prompt_type)
+from aphrodite.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
+                              LLMInputs, PromptInputs, SingletonPromptInputs)
+from aphrodite.inputs.parse import is_explicit_encoder_decoder_prompt
from aphrodite.lora.request import LoRARequest
+from aphrodite.multimodal import MultiModalDataDict
from aphrodite.processing.scheduler import (ScheduledSequenceGroup, Scheduler,
                                            SchedulerOutputs)
from aphrodite.prompt_adapter.request import PromptAdapterRequest
@@ -67,6 +69,11 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:

_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)

+PromptComponents = Tuple[Optional[str], List[int],
+                         Optional[MultiModalDataDict]]
+DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
+                                Optional[MultiModalDataDict]]
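+# Each alias is a (prompt, prompt_token_ids, multi_modal_data) triple; the
+# decoder variant permits prompt_token_ids to be None when no decoder
+# prompt is supplied.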
+

class AphroditeEngine:
    """An LLM engine that receives requests and generates texts.
@@ -472,7 +479,7 @@ class AphroditeEngine:

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

-    def _get_decoder_start_token_id(self, ) -> Optional[int]:
+    def _get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
@@ -501,7 +508,7 @@ class AphroditeEngine:
    def _add_processed_request(
        self,
        request_id: str,
-        processed_inputs: LLMInputs,
+        processed_inputs: Union[LLMInputs, EncoderDecoderLLMInputs],
        params: Union[SamplingParams, PoolingParams],
        arrival_time: float,
        lora_request: Optional[LoRARequest],
@@ -561,11 +568,11 @@ class AphroditeEngine:
    def stop_remote_worker_execution_loop(self) -> None:
        self.model_executor.stop_remote_worker_execution_loop()

-    _LLMInputComponentsType = Tuple[str, List[int], ]
+    _LLMInputComponentsType = Tuple[str, List[int]]

    def _prepare_decoder_input_ids_for_generation(
        self,
-        decoder_input_ids: Optional[List[int]] = None,
+        decoder_input_ids: Optional[List[int]],
    ) -> List[int]:
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.
@@ -580,14 +587,13 @@ class AphroditeEngine:
        * Processed token list
        """

-        decoder_start_token_id: Optional[int] = (
-            self._get_decoder_start_token_id())
+        decoder_start_token_id = self._get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
-            (decoder_input_ids) = self._get_default_enc_dec_decoder_prompt()
+            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
@@ -598,12 +604,11 @@ class AphroditeEngine:
    def _tokenize_prompt(
        self,
        prompt: str,
-        request_id: Optional[str] = None,
-        lora_request: Optional[str] = None,
+        request_id: str,
+        lora_request: Optional[LoRARequest],
    ) -> List[int]:
        '''
-        Wrapper around application of the model's
-        tokenizer.
+        Wrapper around application of the model's tokenizer.
        Arguments:
        * prompt
        * request_id
@@ -615,81 +620,68 @@ class AphroditeEngine:
        tokenizer = self.get_tokenizer_group("prompts must be None if "
                                             "skip_tokenizer_init is True")

-        prompt_token_ids = tokenizer.encode(request_id=request_id,
-                                            prompt=prompt,
-                                            lora_request=lora_request)
+        return tokenizer.encode(request_id=request_id,
+                                prompt=prompt,
+                                lora_request=lora_request)

-        return prompt_token_ids
-
-    def _extract_single_prompt_for_enc_dec_input(
+    def _extract_prompt_components(
        self,
-        inputs: Optional[PromptInputs],
-        request_id: Optional[str] = None,
-        ptype: Optional[str] = None,
-        is_encoder_prompt: bool = False,
-    ) -> Tuple[Optional[str], List[int]]:
+        inputs: SingletonPromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> PromptComponents:
        '''
-        Only for encoder/decoder models:
-        Extract prompt & prompt_token_ids from any single
-        encoder or decoder input prompt. For encoder input prompts
-        in particular, also extract multi-modal data.
-        This function handles the following scenarios:
-        1. The user supplied a singleton encoder prompt
-        & the prompt/prompt-token-ids must be extracted.
-        2. The user supplied an explicit encoder/decoder
-        prompt & the prompt/prompt-token-ids must be
-        extracted from either the encoder and decoder prompts.
-        For decoder prompts in particular (scenario 2), special
-        processing is applied to the returned decoder token ids.
+        Extract the components of any single encoder or decoder input prompt.
        Arguments:
        * request_id
-        * ptype: str representation of the input prompt type.
-          If `ptype` is `None`, assume that the prompt
-          type is unknown and must be inferred. This is the
-          case for ExplicitEncoderDecoder sub-prompts.
        * inputs: single encoder or decoder input prompt
-        * is_encoder_prompt: True if encoder input prompt.
-          If False, decoder prompt tokens
-          are preprocessed.
+        * lora_request: this is only valid for decoder prompts
        Returns:
        * prompt
        * prompt_token_ids
+        * multi_modal_data
        '''
-        prompt_token_ids = None
-        ptype = (get_prompt_type(inputs) if ptype is None else ptype)

-        if inputs is None:
-            prompt = None
-        elif ptype == 'str':
+        if isinstance(inputs, str):
            prompt = inputs
            prompt_token_ids = self._tokenize_prompt(
                prompt,
                request_id=request_id,
+                lora_request=lora_request,
            )
-        elif ptype == 'TokensPrompt':
-            prompt = None
-            prompt_token_ids = inputs['prompt_token_ids']
+            multi_modal_data = None
+        elif isinstance(inputs, dict):
+            if "prompt_token_ids" in inputs:
+                prompt = None
+                prompt_token_ids = inputs["prompt_token_ids"]
+            else:
+                # NOTE: This extra assignment is required to pass mypy
+                prompt = parsed_prompt = inputs["prompt"]
+                prompt_token_ids = self._tokenize_prompt(
+                    parsed_prompt,
+                    request_id=request_id,
+                    lora_request=lora_request,
+                )
+
+            multi_modal_data = inputs.get("multi_modal_data")
        else:
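+            # Exhaustiveness check: a SingletonPromptInputs is either a str
+            # or a prompt dict, so this branch is statically unreachable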
-            prompt = inputs['prompt']
-            prompt_token_ids = self._tokenize_prompt(
-                prompt,
-                request_id=request_id,
-            )
+            assert_never(inputs)

-        if not is_encoder_prompt:
-            # Apply special pre-processing to
-            # decoder prompts
-            prompt_token_ids = (self._prepare_decoder_input_ids_for_generation(
-                prompt_token_ids, ))
+        return prompt, prompt_token_ids, multi_modal_data

-        assert prompt_token_ids is not None
+    def _apply_prompt_adapter(
+        self,
+        prompt_token_ids: List[int],
+        prompt_adapter_request: Optional[PromptAdapterRequest],
+    ) -> List[int]:
+        if prompt_adapter_request:
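+            # Reserve room for the adapter's virtual tokens by prepending
+            # placeholder token ids to the prompt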
+            prompt_token_ids = (
+                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
+                + prompt_token_ids)

-        return (
-            prompt,
-            prompt_token_ids,
-        )
+        return prompt_token_ids

-    def _get_default_enc_dec_decoder_prompt(self, ) -> List[int]:
+    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
@@ -718,18 +710,39 @@ class AphroditeEngine:

        bos_token_id = self._get_bos_token_id()
        assert bos_token_id is not None
-        prompt_token_ids: List[int] = [bos_token_id]
-        return prompt_token_ids
+        return [bos_token_id]
+
+    def _build_enc_dec_llm_inputs(
+        self,
+        encoder_comps: PromptComponents,
+        decoder_comps: DecoderPromptComponents,
+    ) -> EncoderDecoderLLMInputs:
+        encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
+        decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
+
+        if encoder_mm_data is not None or decoder_mm_data is not None:
+            raise ValueError("Multi-modal encoder-decoder models are "
+                             "not supported yet")
+
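+        # Force the decoder prompt to begin with the decoder start token;
+        # a default decoder prompt is synthesized if none was provided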
+        decoder_prompt_ids = (
+            self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
+
+        return EncoderDecoderLLMInputs(
+            prompt_token_ids=decoder_prompt_ids,
+            prompt=decoder_prompt,
+            encoder_prompt_token_ids=encoder_prompt_ids,
+            encoder_prompt=encoder_prompt,
+        )

    def _process_encoder_decoder_prompt(
        self,
        inputs: PromptInputs,
-        request_id: Optional[str] = None,
-    ) -> LLMInputs:
+        request_id: str,
+    ) -> EncoderDecoderLLMInputs:
        '''
        For encoder/decoder models only:
-        Process an input prompt
-        into an `LLMInputs` instance.
+        Process an input prompt into an
+        :class:`EncoderDecoderLLMInputs` instance.
        There are two types of input prompts:
        singleton prompts which carry only the
        encoder prompt, and explicit encoder/decoder
@@ -750,131 +763,98 @@ class AphroditeEngine:
        * inputs: an input prompt
        * request_id
        Returns:
-        * `LLMInputs` instance
+        * :class:`EncoderDecoderLLMInputs` instance
        '''

-        ptype = get_prompt_type(inputs)
-
-        # Obtain encoder and decoder prompt tokens. Note
-        # that, no matter what, the decoder
-        # prompt type is unknown.
-        if ptype == "ExplicitEncoderDecoder":
-            # If input is explicit encoder/decoder prompt,
-            # then it remains to be determined what type
-            # of encoder prompt we have
-            extracted_encoder_prompt = inputs.get('encoder_prompt')
-            encoder_ptype = None
-            # Extract decoder prompt from explicit
-            # encoder/decoder prompt
-            extracted_decoder_prompt = inputs.get('decoder_prompt')
+        encoder_comps: PromptComponents
+        decoder_comps: DecoderPromptComponents
+
+        if is_explicit_encoder_decoder_prompt(inputs):
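+            # Explicit form: encoder and decoder prompts are supplied
+            # separately in the input dict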
+            encoder_comps = self._extract_prompt_components(
+                inputs["encoder_prompt"],
+                request_id=request_id,
+            )
+
+            if (decoder_input := inputs["decoder_prompt"]) is None:
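+                # No decoder prompt was given; a default one is synthesized
+                # when the final inputs are built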
+                decoder_comps = None, None, None
+            else:
+                decoder_comps = self._extract_prompt_components(
+                    decoder_input,
+                    request_id=request_id,
+                )
        else:
-            # If input is singleton encoder prompt, then
-            # we know the encoder prompt type
-            extracted_encoder_prompt = inputs
-            encoder_ptype = ptype
-            # Decoder prompt is always unknown if
-            # encoder/decoder prompt is not explicit
-            extracted_decoder_prompt = None
-
-        # Invoke helper function to obtain encoder
-        # prompt and prompt token ids, either from
-        # singleton encoder prompt or from the
-        # encoder sub-prompt of an explicit
-        # encoder/decoder prompt
-        (
-            encoder_prompt,
-            encoder_prompt_token_ids,
-        ) = self._extract_single_prompt_for_enc_dec_input(
-            extracted_encoder_prompt,
-            request_id=request_id,
-            ptype=encoder_ptype,
-            is_encoder_prompt=True,
-        )
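+            # Singleton form: the entire input is the encoder prompt;
+            # there is no decoder prompt to extract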
+            encoder_comps = self._extract_prompt_components(
+                inputs,
+                request_id=request_id,
+            )

-        # Invoke helper method to obtain
-        # decoder prompt and prompt token ids.
-        #
-        # The helper method will detect the decoder
-        # prompt type.
-        #
-        # Helper method will also apply special
-        # preprocessing unique to decoder prompts.
-        (
-            decoder_prompt,
-            decoder_prompt_token_ids,
-        ) = self._extract_single_prompt_for_enc_dec_input(
-            extracted_decoder_prompt,
-            request_id=request_id,
-            ptype=None,
-            is_encoder_prompt=False,
-        )
+            decoder_comps = None, None, None

-        return LLMInputs(
-            prompt_token_ids=decoder_prompt_token_ids,
-            prompt=decoder_prompt,
-            encoder_prompt_token_ids=encoder_prompt_token_ids,
-            encoder_prompt=encoder_prompt,
-        )
+        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
+
+    def _build_decoder_only_llm_inputs(
+        self,
+        prompt_comps: PromptComponents,
+        prompt_adapter_request: Optional[PromptAdapterRequest],
+    ) -> LLMInputs:
+        prompt, prompt_token_ids, multi_modal_data = prompt_comps
+
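+        # Prompt adapters consume leading virtual token positions, so
+        # placeholder ids are prepended before building the final inputs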
+        prompt_token_ids = self._apply_prompt_adapter(
+            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
+
+        return LLMInputs(prompt_token_ids=prompt_token_ids,
+                         prompt=prompt,
+                         multi_modal_data=multi_modal_data)

    def _process_decoder_only_prompt(
        self,
-        inputs: PromptInputs,
+        inputs: SingletonPromptInputs,
+        request_id: str,
        lora_request: Optional[LoRARequest] = None,
-        request_id: Optional[str] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> LLMInputs:
        '''
        For decoder-only models:
-        Process an input prompt
-        into an `LLMInputs` instance.
+        Process an input prompt into an :class:`LLMInputs` instance.
        Arguments:
        * inputs: input prompt
-        * lora_request
        * request_id
+        * lora_request
        * prompt_adapter_request
        Returns:
-        * `LLMInputs` instance
+        * :class:`LLMInputs` instance
        '''
-        if isinstance(inputs, str):
-            inputs = {"prompt": inputs}
-        prompt = inputs.get("prompt")
-
-        if "prompt_token_ids" not in inputs:
-            prompt_token_ids = self._tokenize_prompt(
-                prompt,
-                request_id=request_id,
-                lora_request=lora_request,
-            )
-        else:
-            prompt_token_ids = inputs["prompt_token_ids"]

-        if prompt_adapter_request:
-            prompt_token_ids = (
-                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
-                + prompt_token_ids)
+        prompt_comps = self._extract_prompt_components(
+            inputs,
+            request_id=request_id,
+            lora_request=lora_request,
+        )

-        return LLMInputs(prompt_token_ids=prompt_token_ids,
-                         prompt=prompt,
-                         multi_modal_data=inputs.get("multi_modal_data"))
+        return self._build_decoder_only_llm_inputs(
+            prompt_comps,
+            prompt_adapter_request=prompt_adapter_request,
+        )

    def process_model_inputs(
        self,
-        request_id: str,
        inputs: PromptInputs,
+        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> LLMInputs:
+    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:

        if self.is_encoder_decoder_model():
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
-
            model_inputs = self._process_encoder_decoder_prompt(
                inputs,
                request_id=request_id,
            )
        else:
+            if is_explicit_encoder_decoder_prompt(inputs):
+                raise ValueError("Cannot pass encoder-decoder prompt "
+                                 "to decoder-only models")
            # Decoder-only operation
            model_inputs = self._process_decoder_only_prompt(
                inputs,
@@ -945,10 +925,11 @@ class AphroditeEngine:
        arrival_time = time.time()

        processed_inputs = self.process_model_inputs(
+            inputs,
            request_id=request_id,
-            inputs=inputs,
            lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request)
+            prompt_adapter_request=prompt_adapter_request,
+        )

        self._add_processed_request(
            request_id=request_id,
@@ -1450,10 +1431,10 @@ class AphroditeEngine:
        self.model_executor.check_health()

    def is_encoder_decoder_model(self):
-        return is_encoder_decoder_model_config(self.model_config)
+        return self.model_config.is_encoder_decoder_model

    def is_embedding_model(self):
-        return is_embedding_model_config(self.model_config)
+        return self.model_config.is_embedding_model


setup_logger()