123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- import pytest
- from aphrodite.worker.model_runner import _get_graph_batch_size
- from tests.models.utils import check_outputs_equal
# Model identifiers that every parametrized test in this module runs against.
MODELS = [
    "ai21labs/Jamba-tiny-random",
]
# Fails due to usage of MoE as MLP (E=1), which is different than the HF impl
# TODO: Fix this with trained model
@pytest.mark.skip(
    reason="MoE used as MLP (E=1) differs from the HF implementation; "
    "needs a trained model")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_models(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare greedy generations from the HF runner and the Aphrodite runner.

    Both runners generate ``max_tokens`` tokens greedily for every prompt in
    ``example_prompts``; the generated strings and token ids must match
    pairwise. The test is currently skipped unconditionally (see the skip
    reason on the decorator).
    """
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
        aphrodite_outputs = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)

    # zip() stops at the shortest input, so make sure nothing can be dropped
    # silently before comparing pairwise.
    assert len(hf_outputs) == len(aphrodite_outputs) == len(example_prompts)

    for i, (hf_output, aphrodite_output) in enumerate(
            zip(hf_outputs, aphrodite_outputs)):
        # Each output is an (ids, text) pair.
        hf_output_ids, hf_output_str = hf_output
        aphrodite_output_ids, aphrodite_output_str = aphrodite_output
        assert hf_output_str == aphrodite_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nAphrodite: "
            f"{aphrodite_output_str!r}")
        assert hf_output_ids == aphrodite_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nAphrodite: {aphrodite_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_batching(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that one-by-one and batched greedy generation agree.

    Every prompt is first generated on its own, then all prompts are
    generated in a single batched call on the same runner; the two result
    lists are compared with ``check_outputs_equal``.
    """
    # NOTE(review): the original comment here said full precision is needed
    # to pass the small model tests, yet dtype is parametrized as "half" --
    # confirm which one is intended.
    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
        # One request per call; keep the single result of each call.
        one_by_one_outputs = [
            aphrodite_model.generate_greedy([single_prompt], max_tokens)[0]
            for single_prompt in example_prompts
        ]

        # Same prompts again, this time submitted together.
        all_at_once_outputs = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=one_by_one_outputs,
        outputs_1_lst=all_at_once_outputs,
        name_0="for_loop_aphrodite",
        name_1="batched_aphrodite",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Run a batch whose size is not a CUDA-graph captured batch size.

    This test is for verifying that mamba cache is padded to CG captured
    batch size. If it's not, a torch RuntimeError will be raised because
    tensor dimensions aren't compatible; that error is turned into a test
    failure with an explanatory message.
    """
    # Pad a private copy so the list handed in by the fixture is not mutated.
    prompts = list(example_prompts)

    # Keep adding a duplicate prompt until the batch size differs from the
    # size _get_graph_batch_size() maps it to.
    while len(prompts) == _get_graph_batch_size(len(prompts)):
        prompts.append(prompts[0])

    try:
        with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
            aphrodite_model.generate_greedy(prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
            "Couldn't run batch size which is not equal to a Cuda Graph "
            "captured batch size. "
            "Could be related to mamba cache not padded correctly")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that outputs are identical with and without preemptions.

    The same prompts are generated twice on one runner: once with the
    scheduler's ``ENABLE_ARTIFICIAL_PREEMPT`` flag switched on (recompute)
    and once with it switched off. Both result lists must be equal.
    """
    assert dtype == "float"

    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
        first_scheduler = aphrodite_model.model.llm_engine.scheduler[0]

        # First pass: artificial preemption enabled.
        first_scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        outputs_with_preemption = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)

        # Second pass: artificial preemption disabled.
        first_scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
        outputs_without_preemption = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=outputs_with_preemption,
        outputs_1_lst=outputs_without_preemption,
        name_0="aphrodite_preepmtions",
        name_1="aphrodite",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    aphrodite_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Submit far more requests than ``max_num_seqs`` and expect no ValueError.

    This test is for verifying that the Jamba inner state management doesn't
    collapse in case where the number of incoming requests and
    finished_requests_ids is larger than the maximum mamba block capacity.
    This could generally happen due to the fact that Jamba does support
    statelessness mechanism where it can cleanup new incoming requests in
    a single step.
    """
    try:
        with aphrodite_runner(model, dtype=dtype,
                              max_num_seqs=10) as aphrodite_model:
            # 100 copies of one prompt against a runner limited to 10 seqs.
            aphrodite_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # The two literals were previously joined without a space, which
        # rendered as "betweensteps" in the failure message.
        pytest.fail("Jamba inner state wasn't cleaned up properly between "
                    "steps finished requests registered unnecessarily ")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
    aphrodite_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Repeatedly generate on one runner and expect no ValueError.

    This test is for verifying that the Jamba state is cleaned up between
    steps. If it is not cleaned, an error would be expected; a ValueError
    is reported as a test failure.
    """
    num_rounds = 10
    copies_per_round = 100

    try:
        with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
            for _ in range(num_rounds):
                # Same first prompt repeated, one generated token each.
                repeated_prompts = [example_prompts[0]] * copies_per_round
                aphrodite_model.generate_greedy(repeated_prompts, 1)
    except ValueError:
        pytest.fail("Jamba inner state wasn't cleaned up between states, "
                    "could be related to finished_requests_ids")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_model_print(
    aphrodite_runner,
    model: str,
    dtype: str,
) -> None:
    """Print the loaded model object.

    This test is for verifying whether the model's extra_repr can be
    printed correctly.
    """
    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
        # Walk down to the model held by the driver worker's model runner.
        executor = aphrodite_model.model.llm_engine.model_executor
        loaded_model = executor.driver_worker.model_runner.model
        print(loaded_model)
|