|
@@ -1,9 +1,8 @@
|
|
|
-"""Compare the outputs of HF and Aphrodite for BART models using greedy
|
|
|
-sampling.
|
|
|
+"""Compare the outputs of HF and Aphrodite for BART models using greedy sampling.
|
|
|
|
|
|
-Run `pytest tests/models/test_bart.py`.
|
|
|
+Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
|
|
|
"""
|
|
|
-from typing import List, Optional, Tuple
|
|
|
+from typing import List, Optional, Tuple, Type
|
|
|
|
|
|
from aphrodite.common.utils import is_cpu
|
|
|
|
|
@@ -17,8 +16,10 @@ if not is_cpu():
|
|
|
|
|
|
from aphrodite.common.sequence import SampleLogprobs
|
|
|
|
|
|
- from ..conftest import DecoderPromptType
|
|
|
- from .utils import check_logprobs_close
|
|
|
+ from ....conftest import (AphroditeRunner, DecoderPromptType,
|
|
|
+ ExplicitEncoderDecoderPrompt, HfRunner)
|
|
|
+ from ....utils import multi_gpu_test
|
|
|
+ from ...utils import check_logprobs_close
|
|
|
|
|
|
MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
|
|
|
|
|
@@ -35,24 +36,22 @@ if not is_cpu():
|
|
|
|
|
|
return output_ids, hf_output_str, out_logprobs
|
|
|
|
|
|
- @pytest.mark.parametrize("model", MODELS)
|
|
|
- @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
|
|
- @pytest.mark.parametrize("max_tokens", [64])
|
|
|
- @pytest.mark.parametrize("num_logprobs", [5])
|
|
|
- @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
|
|
- def test_models(
|
|
|
- hf_runner,
|
|
|
- aphrodite_runner,
|
|
|
- example_encoder_decoder_prompts,
|
|
|
+ def run_test(
|
|
|
+ hf_runner: Type[HfRunner],
|
|
|
+ aphrodite_runner: Type[AphroditeRunner],
|
|
|
+ prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
|
|
|
+ decoder_prompt_type: DecoderPromptType,
|
|
|
model: str,
|
|
|
+ *,
|
|
|
dtype: str,
|
|
|
max_tokens: int,
|
|
|
num_logprobs: int,
|
|
|
- decoder_prompt_type: DecoderPromptType,
|
|
|
+ tensor_parallel_size: int,
|
|
|
+ distributed_executor_backend: Optional[str] = None,
|
|
|
) -> None:
|
|
|
'''
|
|
|
- Test the Aphrodite BART model for a variety of encoder/decoder
|
|
|
- input prompts, by validating it against HuggingFace (HF) BART.
|
|
|
+ Test the Aphrodite BART model for a variety of encoder/decoder input prompts,
|
|
|
+ by validating it against HuggingFace (HF) BART.
|
|
|
|
|
|
Arguments:
|
|
|
|
|
@@ -88,23 +87,22 @@ if not is_cpu():
|
|
|
then (4) after computing logits during prefill, override the model
|
|
|
logits & force <BOS> to be the first generated token.
|
|
|
|
|
|
- * Aphrodite will (1) tokenize the None prompt as [<BOS>], (2) append
|
|
|
- decoder-start-token to the beginning, yielding
|
|
|
- [<decoder-start-token><BOS>], (3) pass these tokens to the model &
|
|
|
- proceed with generation.
|
|
|
+ * Aphrodite will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
|
|
|
+ start-token to the beginning, yielding [<decoder-start-token><BOS>],
|
|
|
+ (3) pass these tokens to the model & proceed with generation.
|
|
|
+
|
|
|
+ The net effect is that compared to Aphrodite, the list of HF *decoded* tokens
|
|
|
+ will contain one more initial <BOS> than the Aphrodite generated tokens,
|
|
|
+ because Aphrodite's <BOS> token is injected into the prompt rather than into
|
|
|
+ the generated output. This is in spite of the fact that overall, the
|
|
|
+ complete sequences (prompt + decoded tokens) produced by Aphrodite will match
|
|
|
+ HF.
|
|
|
+
|
|
|
+ So when we use HF decoded token output to validate Aphrodite's decoded token
|
|
|
+ output, the testing process must account for the difference in decoded
|
|
|
+ token sequences between Aphrodite and HF specifically in the
|
|
|
+ decoder-prompt-is-None case.
|
|
|
|
|
|
- The net effect is that compared to Aphrodite, the list of HF
|
|
|
- *decoded* tokens will contain one more initial <BOS> than the
|
|
|
- Aphrodite generated tokens, because Aphrodite's <BOS> token is
|
|
|
- injected into the prompt rather than into the generated output.
|
|
|
- This is in spite of the fact that overall, the complete sequences
|
|
|
- (prompt + decoded tokens) produced by Aphrodite will match HF.
|
|
|
-
|
|
|
- So when we use HF decoded token output to validate Aphrodite's decoded
|
|
|
- token output, the testing process must account for the difference in
|
|
|
- decoded token sequences between Aphrodite and HF specifically in the
|
|
|
- decoder-prompt-is-None case.
|
|
|
-
|
|
|
One option is to disable the logit processor feature that forces the
|
|
|
<BOS> token to be decoded (forced_bos_token_id = None), eliminating
|
|
|
the problem entirely. However this is not "normal" BART usage.
|
|
@@ -118,8 +116,29 @@ if not is_cpu():
|
|
|
token during the process of validating the Aphrodite decoded output.
|
|
|
'''
|
|
|
|
|
|
- test_case_prompts = example_encoder_decoder_prompts[
|
|
|
- decoder_prompt_type]
|
|
|
+ # NOTE: take care of the order. run Aphrodite first, and then run HF.
|
|
|
+ # Aphrodite needs a fresh new process without cuda initialization.
|
|
|
+ # if we run HF first, the cuda initialization will be done and it
|
|
|
+ # will hurt multiprocessing backend with fork method (the default).
|
|
|
+
|
|
|
+ # Note: currently encoder/decoder models are only compatible with
|
|
|
+ # enforce_eager=True. Normally this is not a problem because
|
|
|
+ # for encoder/decoder models Aphrodite will
|
|
|
+ # default to enforce_eager=True if enforce_eager
|
|
|
+ # is left unspecified. However, the
|
|
|
+ # AphroditeRunner test fixture (which wraps around the LLM class) defaults to
|
|
|
+ # enforce_eager=False (a behavior which a number of already-existing
|
|
|
+ # decoder-only unit tests expect), so when testing an encoder/decoder
|
|
|
+ # model we must explicitly specify enforce_eager=True in the AphroditeRunner
|
|
|
+ # constructor.
|
|
|
+ with aphrodite_runner(
|
|
|
+ model,
|
|
|
+ dtype=dtype,
|
|
|
+ tensor_parallel_size=tensor_parallel_size,
|
|
|
+ distributed_executor_backend=distributed_executor_backend,
|
|
|
+ enforce_eager=True) as aphrodite_model:
|
|
|
+ aphrodite_outputs = aphrodite_model.generate_encoder_decoder_greedy_logprobs(
|
|
|
+ prompts, max_tokens, num_logprobs)
|
|
|
|
|
|
# Configuration settings for HF baseline
|
|
|
hf_kwargs = {
|
|
@@ -137,28 +156,12 @@ if not is_cpu():
|
|
|
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
|
|
hf_outputs = (
|
|
|
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
|
|
- test_case_prompts,
|
|
|
+ prompts,
|
|
|
max_tokens,
|
|
|
num_logprobs,
|
|
|
**hf_kwargs,
|
|
|
))
|
|
|
|
|
|
- # Note: currently encoder/decoder models are only compatible with
|
|
|
- # enforce_eager=True. Normally this is not a problem because
|
|
|
- # for encoder/decoder models Aphrodite will
|
|
|
- # default to enforce_eager=True if enforce_eager
|
|
|
- # is left unspecified. However, the
|
|
|
- # AphroditeRunner test fixture (which wraps around the LLM class)
|
|
|
- # defaults to enforce_eager=False (a behavior which a number of
|
|
|
- # already-exisitng decoder-only unit tests expect), so when testing
|
|
|
- # an encoder/decoder model we must explicitly specify enforce_eager=True
|
|
|
- # in the AphroditeRunner constructor.
|
|
|
- with aphrodite_runner(model, dtype=dtype,
|
|
|
- enforce_eager=True) as aphrodite_model:
|
|
|
- aphrodite_outputs = (
|
|
|
- aphrodite_model.generate_encoder_decoder_greedy_logprobs(
|
|
|
- test_case_prompts, max_tokens, num_logprobs))
|
|
|
-
|
|
|
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
|
|
|
else 0)
|
|
|
|
|
@@ -172,3 +175,49 @@ if not is_cpu():
|
|
|
name_1="aphrodite",
|
|
|
num_outputs_0_skip_tokens=hf_skip_tokens,
|
|
|
)
|
|
|
+
|
|
|
+ @pytest.mark.parametrize("model", MODELS)
|
|
|
+ @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
|
|
+ @pytest.mark.parametrize("max_tokens", [64])
|
|
|
+ @pytest.mark.parametrize("num_logprobs", [5])
|
|
|
+ @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
|
|
+ def test_models(hf_runner, aphrodite_runner, example_encoder_decoder_prompts,
|
|
|
+ model, dtype, max_tokens, num_logprobs,
|
|
|
+ decoder_prompt_type) -> None:
|
|
|
+
|
|
|
+ run_test(
|
|
|
+ hf_runner,
|
|
|
+ aphrodite_runner,
|
|
|
+ example_encoder_decoder_prompts[decoder_prompt_type],
|
|
|
+ decoder_prompt_type,
|
|
|
+ model,
|
|
|
+ dtype=dtype,
|
|
|
+ max_tokens=max_tokens,
|
|
|
+ num_logprobs=num_logprobs,
|
|
|
+ tensor_parallel_size=1,
|
|
|
+ )
|
|
|
+
|
|
|
+ @multi_gpu_test(num_gpus=2)
|
|
|
+ @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
|
|
+ @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
|
|
|
+ @pytest.mark.parametrize("dtype", ["float"])
|
|
|
+ @pytest.mark.parametrize("max_tokens", [64])
|
|
|
+ @pytest.mark.parametrize("num_logprobs", [5])
|
|
|
+ @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
|
|
|
+ def test_models_distributed(hf_runner, aphrodite_runner,
|
|
|
+ example_encoder_decoder_prompts,
|
|
|
+ distributed_executor_backend, model, dtype,
|
|
|
+ max_tokens, num_logprobs,
|
|
|
+ decoder_prompt_type) -> None:
|
|
|
+ run_test(
|
|
|
+ hf_runner,
|
|
|
+ aphrodite_runner,
|
|
|
+ example_encoder_decoder_prompts[decoder_prompt_type],
|
|
|
+ decoder_prompt_type,
|
|
|
+ model,
|
|
|
+ dtype=dtype,
|
|
|
+ max_tokens=max_tokens,
|
|
|
+ num_logprobs=num_logprobs,
|
|
|
+ tensor_parallel_size=2,
|
|
|
+ distributed_executor_backend=distributed_executor_backend,
|
|
|
+ )
|