test_experts_int8.py

# flake8: noqa
"""Tests experts_int8 quantization startup and generation;
does not test output correctness.
"""
import pytest

from tests.quantization.utils import is_quant_method_supported

MODELS = ["ai21labs/Jamba-tiny-random"]


@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
                    reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_experts_int8_startup(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # Smoke test: the model must load with experts_int8 quantization and
    # generate tokens without crashing; the output itself is not validated.
    with aphrodite_runner(model, dtype=dtype,
                          quantization="experts_int8") as aphrodite_model:
        aphrodite_model.generate_greedy(example_prompts, max_tokens)
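
# Usage sketch: `hf_runner`, `aphrodite_runner`, and `example_prompts` are
# pytest fixtures supplied by the test suite's conftest. Assuming this file
# lives at tests/quantization/test_experts_int8.py (inferred from the import
# above), it can be run in isolation with:
#   pytest tests/quantization/test_experts_int8.py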