
tests: refactor model tests (#1078)

* tests: refactor model tests

* typo
AlpinDale 2 months ago
parent
commit
7b6501bd05
59 changed files with 1239 additions and 946 deletions
  1. aphrodite/transformers_utils/tokenizers/mistral.py (+24 -3)
  2. tests/basic_correctness/test_basic_correctness.py (+67 -2)
  3. tests/basic_correctness/test_chunked_prefill.py (+68 -38)
  4. tests/basic_correctness/test_preemption.py (+14 -11)
  5. tests/conftest.py (+23 -22)
  6. tests/distributed/test_basic_distributed_correctness.py (+0 -81)
  7. tests/distributed/test_basic_distributed_correctness_enc_dec.py (+0 -103)
  8. tests/distributed/test_chunked_prefill_distributed.py (+0 -75)
  9. tests/distributed/test_multimodal_broadcast.py (+0 -59)
  10. tests/distributed/test_same_node.py (+7 -7)
  11. tests/kernels/utils.py (+6 -3)
  12. tests/models/decoder_only/__init__.py (+0 -0)
  13. tests/models/decoder_only/audio_language/__init__.py (+0 -0)
  14. tests/models/decoder_only/audio_language/test_ultravox.py (+3 -6)
  15. tests/models/decoder_only/language/__init__.py (+0 -0)
  16. tests/models/decoder_only/language/test_aqlm.py (+2 -22)
  17. tests/models/decoder_only/language/test_big_models.py (+11 -9)
  18. tests/models/decoder_only/language/test_danube3_4b.py (+2 -3)
  19. tests/models/decoder_only/language/test_fp8.py (+99 -0)
  20. tests/models/decoder_only/language/test_gguf.py (+20 -6)
  21. tests/models/decoder_only/language/test_gptq_marlin.py (+2 -2)
  22. tests/models/decoder_only/language/test_gptq_marlin_24.py (+2 -1)
  23. tests/models/decoder_only/language/test_granite.py (+9 -6)
  24. tests/models/decoder_only/language/test_jamba.py (+7 -10)
  25. tests/models/decoder_only/language/test_marlin.py (+1 -1)
  26. tests/models/decoder_only/language/test_mistral.py (+175 -0)
  27. tests/models/decoder_only/language/test_modelopt.py (+79 -0)
  28. tests/models/decoder_only/language/test_models.py (+2 -3)
  29. tests/models/decoder_only/language/test_phimoe.py (+112 -0)
  30. tests/models/decoder_only/vision_language/__init__.py (+0 -0)
  31. tests/models/decoder_only/vision_language/test_blip2.py (+5 -8)
  32. tests/models/decoder_only/vision_language/test_broadcast.py (+42 -0)
  33. tests/models/decoder_only/vision_language/test_chameleon.py (+6 -8)
  34. tests/models/decoder_only/vision_language/test_fuyu.py (+5 -8)
  35. tests/models/decoder_only/vision_language/test_intern_vit.py (+77 -0)
  36. tests/models/decoder_only/vision_language/test_internvl.py (+7 -7)
  37. tests/models/decoder_only/vision_language/test_llava.py (+7 -9)
  38. tests/models/decoder_only/vision_language/test_llava_image_embeds.py (+4 -6)
  39. tests/models/decoder_only/vision_language/test_llava_next.py (+6 -8)
  40. tests/models/decoder_only/vision_language/test_llava_next_video.py (+62 -84)
  41. tests/models/decoder_only/vision_language/test_minicpmv.py (+37 -107)
  42. tests/models/decoder_only/vision_language/test_paligemma.py (+6 -8)
  43. tests/models/decoder_only/vision_language/test_phi3v.py (+6 -9)
  44. tests/models/decoder_only/vision_language/test_pixtral.py (+54 -22)
  45. tests/models/decoder_only/vision_language/test_qwen.py (+33 -5)
  46. tests/models/embedding/__init__.py (+0 -0)
  47. tests/models/embedding/language/__init__.py (+0 -0)
  48. tests/models/embedding/language/test_embedding.py (+1 -1)
  49. tests/models/encoder_decoder/__init__.py (+0 -0)
  50. tests/models/encoder_decoder/language/__init__.py (+0 -0)
  51. tests/models/encoder_decoder/language/test_bart.py (+102 -53)
  52. tests/models/fixtures/pixtral_chat.json (+0 -0)
  53. tests/models/fixtures/pixtral_chat.pickle (BIN)
  54. tests/models/fixtures/pixtral_chat_engine.json (+0 -0)
  55. tests/models/fixtures/pixtral_chat_engine.pickle (BIN)
  56. tests/models/test_fp8.py (+0 -118)
  57. tests/models/test_registry.py (+5 -0)
  58. tests/models/utils.py (+4 -1)
  59. tests/utils.py (+35 -11)

+ 24 - 3
aphrodite/transformers_utils/tokenizers/mistral.py

@@ -177,10 +177,27 @@ class MistralTokenizer:
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         if isinstance(self.tokenizer, Tekkenizer):
-            return "".join(t for t in tokens
-                           if t not in self.tokenizer._all_special_tokens)
+            tokens = [
+                t for t in tokens
+                if t not in self.tokenizer._all_special_tokens
+            ]
+            if any(isinstance(t, bytes) for t in tokens):
+                # we need to encode and decode all tokens again
+                shift = self.tokenizer.num_special_tokens
+                byte_tokens = [
+                    t.encode("utf-8") if not isinstance(t, bytes) else t
+                    for t in tokens
+                ]
+                ids = [
+                    self.tokenizer._tekken_token2id_nospecial[t] + shift
+                    for t in byte_tokens
+                ]
+                decoded = self.tokenizer.decode(ids)
+            else:
+                decoded = "".join(tokens)
         else:
-            return self.tokenizer.decode(tokens)  # type: ignore[arg-type]
+            decoded = self.tokenizer.decode(tokens)  # type: ignore[arg-type]
+        return decoded
 
     def decode(self, ids: Union[List[int], int]) -> str:
         if isinstance(ids, int):
@@ -204,4 +221,8 @@ class MistralTokenizer:
                               self.tokenizer)
 
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
+        if any(t.strip() == "�" for t in tokens):
+            # if any stripped decoded token is undefined
+            # because it's invalid unicode then pass bytes
+            tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
         return tokens
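
The byte-token branch added above only takes effect when the Tekkenizer hands back raw bytes pieces; those tokens are mapped back to their ids (offset by the number of special tokens) and decoded in a single pass, so multi-byte UTF-8 sequences split across pieces come out intact. Below is a minimal, self-contained sketch of that fallback; `_ToyTekkenizer` and its attributes are assumptions standing in for the real mistral_common Tekkenizer.

```python
from typing import List, Union


class _ToyTekkenizer:
    """Stand-in for mistral_common's Tekkenizer (names are assumptions)."""
    num_special_tokens = 3

    def __init__(self) -> None:
        # byte piece -> id, before the special-token offset is applied
        self._tekken_token2id_nospecial = {
            b"he": 0,
            b"llo": 1,
            b"\xf0\x9f\x98\x80": 2,  # UTF-8 bytes of an emoji
        }
        self._id2piece = {
            v: k for k, v in self._tekken_token2id_nospecial.items()
        }

    def decode(self, ids: List[int]) -> str:
        shift = self.num_special_tokens
        return b"".join(self._id2piece[i - shift] for i in ids).decode("utf-8")


def convert_tokens_to_string(tok: _ToyTekkenizer,
                             tokens: List[Union[str, bytes]]) -> str:
    if any(isinstance(t, bytes) for t in tokens):
        # Re-encode every token to its id and decode the whole sequence at
        # once, so byte pieces that are not valid UTF-8 on their own still
        # produce a clean string.
        shift = tok.num_special_tokens
        byte_tokens = [
            t if isinstance(t, bytes) else t.encode("utf-8") for t in tokens
        ]
        ids = [tok._tekken_token2id_nospecial[t] + shift for t in byte_tokens]
        return tok.decode(ids)
    return "".join(tokens)


if __name__ == "__main__":
    tok = _ToyTekkenizer()
    # "hello" followed by an emoji that only exists as a byte piece
    print(convert_tokens_to_string(tok, ["he", "llo", b"\xf0\x9f\x98\x80"]))
```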

+ 67 - 2
tests/basic_correctness/test_basic_correctness.py

@@ -15,12 +15,15 @@ from aphrodite.common.utils import is_hip
 from aphrodite.worker.model_runner import ModelInputForGPUWithSamplingMetadata
 
 from ..models.utils import check_outputs_equal
+from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
 
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
+
 
 def test_aphrodite_gc_ed():
     """Verify aphrodite instance is GC'ed when it is deleted"""
@@ -60,8 +63,68 @@ def test_models(
                      dtype=dtype,
                      enforce_eager=enforce_eager,
                      gpu_memory_utilization=0.7) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
-                                                            max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(
+            example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=aphrodite_outputs,
+        name_0="hf",
+        name_1="aphrodite",
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model, distributed_executor_backend, attention_backend, "
+    "test_suite", [
+        ("facebook/opt-125m", "ray", "", "L4"),
+        ("facebook/opt-125m", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("facebook/opt-125m", "ray", "", "A100"),
+        ("facebook/opt-125m", "mp", "", "A100"),
+        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    ])
+def test_models_distributed(
+    hf_runner,
+    aphrodite_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+    attention_backend: str,
+    test_suite: str,
+) -> None:
+
+    if test_suite != TARGET_TEST_SUITE:
+        pytest.skip(f"Skip test for {test_suite}")
+
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        # test ray adag
+        os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"
+
+    if attention_backend:
+        os.environ["APHRODITE_ATTENTION_BACKEND"] = attention_backend
+
+    dtype = "half"
+    max_tokens = 5
+
+    # NOTE: take care of the order. run Aphrodite first, and then run HF.
+    # Aphrodite needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with aphrodite_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as aphrodite_model:
+        aphrodite_outputs = aphrodite_model.generate_greedy(
+            example_prompts, max_tokens)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -84,8 +147,10 @@ def test_model_with_failure(aphrodite_runner) -> None:
                                 str(exc_info.value))
             assert matches is not None
             filename = f"{matches.group(1)}.pkl"
+
         with open(filename, "rb") as filep:
             inputs = pickle.load(filep)
+
         if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
             raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
                                  f"{list(inputs.keys())}")

+ 68 - 38
tests/basic_correctness/test_chunked_prefill.py

@@ -6,28 +6,18 @@ prefill requests are chunked.
 
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
+import os
 from contextlib import nullcontext
 
 import pytest
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
-E5M2_KV_MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-chat-hf",
-]
-E4M3_KV_MODELS = [
-    "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-]
-KV_CACHE_QUANTIZATION_PATHS = {
-    "meta-llama/Llama-2-7b-chat-hf":
-    "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
-}
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -68,8 +58,62 @@ def test_models(
             enforce_eager=enforce_eager,
             max_num_seqs=max_num_seqs,
     ) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
-                                                            max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(
+            example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=aphrodite_outputs,
+        name_0="hf",
+        name_1="aphrodite",
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", MODELS)
+def test_models_distributed(
+    hf_runner,
+    aphrodite_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+) -> None:
+    if (model == "meta-llama/Llama-2-7b-hf"
+            and distributed_executor_backend == "ray"):
+        # test ray adag
+        os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"
+
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+
+    # Add a chunked prefill config.
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    assert chunked_prefill_token_size != -1
+    enable_chunked_prefill = True
+    max_num_batched_tokens = chunked_prefill_token_size
+
+    # NOTE: take care of the order. run Aphrodite first, and then run HF.
+    # Aphrodite needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    with aphrodite_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as aphrodite_model:
+        aphrodite_outputs = aphrodite_model.generate_greedy(
+            example_prompts, max_tokens)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -79,13 +123,13 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("kv_cache_dtype,model",
-                         [("fp8_e5m2", m)
-                          for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
-                                                      for m in E4M3_KV_MODELS])
+@pytest.mark.parametrize(
+    "kv_cache_dtype,model",
+    [("fp8_e4m3",
+      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
 @pytest.mark.parametrize("enforce_eager", [False, True])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
@@ -105,27 +149,15 @@ def test_models_with_fp8_kv_cache(
     disable_async_output_proc: bool,
 ) -> None:
     """
-    Only checks log probs match between chunked-prefill and
-    non-chunked-prefill version of Aphrodite model runner.
-    
-    This test is used when there is discrepancy in kernels
-    / numerics (e.g. when using lower-precision types like FP8).
+    Check output logprobs match between no_chunked_prefill and chunked_prefill
+    with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
+    so here we only check chunked prefill.
     """
     NUM_LOG_PROBS = 8
 
-    if model == "facebook/opt-125m":
-        pytest.skip(
-            "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
-        )
-
     max_num_seqs = chunked_prefill_token_size
     max_num_batched_tokens = chunked_prefill_token_size
 
-    extra_kwargs = {}
-    if model in KV_CACHE_QUANTIZATION_PATHS:
-        extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[
-            model]
-
     with aphrodite_runner(
             model,
             tensor_parallel_size=tensor_parallel_size,
@@ -133,7 +165,6 @@ def test_models_with_fp8_kv_cache(
             max_num_seqs=max_num_seqs,
             kv_cache_dtype=kv_cache_dtype,
             disable_async_output_proc=disable_async_output_proc,
-            **extra_kwargs,
     ) as aphrodite_model:
         no_chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -147,7 +178,6 @@ def test_models_with_fp8_kv_cache(
             max_num_seqs=max_num_seqs,
             kv_cache_dtype=kv_cache_dtype,
             disable_async_output_proc=disable_async_output_proc,
-            **extra_kwargs,
     ) as aphrodite_model:
         chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -212,8 +242,8 @@ def test_with_prefix_caching(
             # Send the request one-by-one to ensure the cache is populated.
             with pytest.raises(ValueError) if should_fail else nullcontext():
                 for prompt in full_prompts:
-                    outputs[enable] += aphrodite_model.generate_greedy(
-                        [prompt], max_tokens)
+                    outputs[enable] += aphrodite_model.generate_greedy([prompt],
+                                                                  max_tokens)
 
     # Check results only if we did not expect a failure.
     if check_result:

+ 14 - 11
tests/basic_correctness/test_preemption.py

@@ -9,8 +9,8 @@ pytest tests/basic_correctness/test_preemption.py`.
 import pytest
 from prometheus_client import REGISTRY
 
+import aphrodite.common.envs as envs
 from aphrodite import SamplingParams
-from aphrodite.executor.ray_gpu_executor import APHRODITE_USE_RAY_SPMD_WORKER
 from aphrodite.processing.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                             ENABLE_ARTIFICIAL_PREEMPT)
 
@@ -20,17 +20,20 @@ MODELS = [
     "facebook/opt-125m",
 ]
 
-assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-    "Use an env var APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
-    "`APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
-    "tests/basic_correctness/test_preemption.py`")
+
+@pytest.fixture(scope="module", autouse=True)
+def check_settings():
+    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+        "Use an env var APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+        "`APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+        "tests/basic_correctness/test_preemption.py`")
 
 
 @pytest.fixture
 def worker_use_ray() -> bool:
     # When SPMD worker is used, use ray_use_worker=True
     # to test delta input optimization works with preemption.
-    return APHRODITE_USE_RAY_SPMD_WORKER
+    return envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -65,9 +68,10 @@ def test_chunked_prefill_recompute(
             enable_chunked_prefill=enable_chunked_prefill,
             max_num_seqs=max_num_seqs,
             worker_use_ray=worker_use_ray,
+            disable_log_stats=False,
     ) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
-                                                            max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(
+            example_prompts, max_tokens)
         assert (
             aphrodite_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
                 < ARTIFICIAL_PREEMPTION_MAX_CNT)
@@ -106,8 +110,8 @@ def test_preemption(
             disable_log_stats=False,
             worker_use_ray=worker_use_ray,
     ) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
-                                                            max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(
+            example_prompts, max_tokens)
         assert (
             aphrodite_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
                 < ARTIFICIAL_PREEMPTION_MAX_CNT)
@@ -216,7 +220,6 @@ def test_swap_infeasible(
     prefill_blocks = 2
     decode_blocks = max_tokens // BLOCK_SIZE
     example_prompts = example_prompts[:1]
-
     with aphrodite_runner(
             model,
             dtype=dtype,

+ 23 - 22
tests/conftest.py

@@ -6,8 +6,8 @@ import sys
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
+                    TypedDict, TypeVar, Union)
 
 import numpy as np
 import pytest
@@ -15,10 +15,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
-from loguru import logger
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                           BatchFeature)
+from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
 from aphrodite import LLM, SamplingParams
 from aphrodite.assets.image import ImageAsset
@@ -89,18 +89,25 @@ class _ImageAssets(_ImageAssetsBase):
 
 class _VideoAssetPrompts(TypedDict):
     sample_demo_1: str
+
+
 if sys.version_info < (3, 9):
     # UserList cannot be subscripted
     class _VideoAssetsBase(UserList):
         pass
 else:
+
     class _VideoAssetsBase(UserList[VideoAsset]):
         pass
+
+
 class _VideoAssets(_VideoAssetsBase):
+
     def __init__(self) -> None:
         super().__init__([
             VideoAsset("sample_demo_1.mp4"),
         ])
+
     def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
         return [prompts["sample_demo_1"]]
 
@@ -150,10 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
     to initialize torch.
     """
 
-    if request.node.get_closest_marker("skip_global_cleanup"):
-        return False
-
-    return True
+    return not request.node.get_closest_marker("skip_global_cleanup")
 
 
 @pytest.fixture(autouse=True)
@@ -253,7 +257,7 @@ class HfRunner:
         *,
         model_kwargs: Optional[Dict[str, Any]] = None,
         is_embedding_model: bool = False,
-        auto_cls=AutoModelForCausalLM,
+        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
         postprocess_inputs: Callable[[BatchEncoding],
                                      BatchEncoding] = identity,
     ) -> None:
@@ -285,20 +289,14 @@ class HfRunner:
             trust_remote_code=True,
         )
 
-        try:
-            # don't put this import at the top level
-            # it will call torch.cuda.device_count()
-            from transformers import AutoProcessor  # noqa: F401
-            self.processor = AutoProcessor.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            )
-        except Exception as exc:
-            logger.warning(
-                f"Unable to auto-load HuggingFace processor for model "
-                f"({model_name}). Using tokenizer instead. Reason: {exc}")
-            self.processor = self.tokenizer
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from transformers import AutoProcessor  # noqa: F401
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )
 
         self.postprocess_inputs = postprocess_inputs
 
@@ -682,6 +680,7 @@ class AphroditeRunner:
 
         if videos is not None:
             assert len(prompts) == len(videos)
+
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
@@ -698,6 +697,7 @@ class AphroditeRunner:
 
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
+
         toks_str_logsprobs_prompt_logprobs = (
             self._final_steps_generate_w_logprobs(req_outputs))
         # Omit prompt logprobs if not required by sampling params
@@ -754,6 +754,7 @@ class AphroditeRunner:
             logprobs=num_logprobs,
             prompt_logprobs=(num_prompt_logprobs),
             stop_token_ids=stop_token_ids)
+
         return self.generate_w_logprobs(prompts,
                                         greedy_logprobs_params,
                                         images=images,
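
With `auto_cls` now typed as `Type[_BaseAutoModelClass]`, callers can pass any HuggingFace Auto class instead of the default AutoModelForCausalLM. A usage sketch in the style of the removed distributed encoder/decoder test follows; the fixture names come from this conftest.py, while the test function itself is only illustrative.

```python
# Illustrative test showing the auto_cls hook: pass the HF Auto class that
# matches the architecture so HfRunner builds the right baseline.
from transformers import AutoModelForSeq2SeqLM

from ..conftest import DecoderPromptType


def test_bart_hf_baseline_example(hf_runner, example_encoder_decoder_prompts):
    prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
    with hf_runner("facebook/bart-large-cnn",
                   dtype="float",
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts, max_tokens=64, num_logprobs=5)
    assert len(outputs) == len(prompts)
```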

+ 0 - 81
tests/distributed/test_basic_distributed_correctness.py

@@ -1,81 +0,0 @@
-"""Compare the outputs of HF and distributed Aphrodite when using
-greedy sampling.
-
-Run:
-```sh
-cd $APHRODITE_PATH/tests
-
-pytest distributed/test_basic_distributed_correctness.py
-```
-"""
-import os
-
-import pytest
-
-from aphrodite.common.utils import cuda_device_count_stateless
-
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-
-TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
-    ])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    aphrodite_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-    attention_backend: str,
-    test_suite: str,
-) -> None:
-
-    if test_suite != TARGET_TEST_SUITE:
-        pytest.skip(f"Skip test for {test_suite}")
-
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test ray adag
-        os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["APHRODITE_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run Aphrodite first, and then run HF.
-    # Aphrodite needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with aphrodite_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=aphrodite_outputs,
-        name_0="hf",
-        name_1="aphrodite",
-    )

+ 0 - 103
tests/distributed/test_basic_distributed_correctness_enc_dec.py

@@ -1,103 +0,0 @@
-"""For encoder/decoder models only:
-Compare the outputs of HF and distributed Aphrodite when using greedy sampling.
-
-Run:
-```sh
-cd $APHRODITE_PATH/tests
-
-pytest distributed/test_basic_distributed_correctness_enc_dec.py
-```
-"""
-
-import pytest
-from transformers import AutoModelForSeq2SeqLM
-
-from aphrodite.common.utils import cuda_device_count_stateless
-
-from ..conftest import DecoderPromptType
-from ..models.utils import check_logprobs_close
-from ..utils import fork_new_process_for_each_test
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/bart-large-cnn", "ray"),
-    ("facebook/bart-large-cnn", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    model: str,
-    distributed_executor_backend: str,
-    hf_runner,
-    aphrodite_runner,
-    example_encoder_decoder_prompts,
-) -> None:
-    '''
-    Test Aphrodite BART inference on more than one GPU, comparing
-    outputs against HF as a baseline.
-
-    Fork a new process for each test, to prevent CUDA from
-    being re-initialized by successive tests within the same
-    process.
-
-    Arguments:
-
-    * model: the HF ID of the specific BART variant under test
-    * distributed_executor_backend
-    * hf_runner: HuggingFace (HF) test model runner
-    * aphrodite_runner: Aphrodite test model runner
-    * example_encoder_decoder_prompts: test fixture which provides a 
-                                        dictionary of dummy prompts
-    '''
-
-    dtype = "float"
-    max_tokens = 64
-    num_logprobs = 5
-
-    # Example inputs with non-trivial (i.e. not None/empty) encoder &
-    # decoder prompts.
-    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
-
-    # NOTE: take care of the order. run Aphrodite first, and then run HF.
-    # Aphrodite needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with aphrodite_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-    ) as aphrodite_model:
-        aphrodite_outputs = (
-            aphrodite_model.generate_encoder_decoder_greedy_logprobs(
-                test_prompts, max_tokens, num_logprobs))
-
-    # Configuration settings for HF baseline
-    hf_kwargs = {
-        "top_k": None,
-        "num_beams": 1,
-        "repetition_penalty": 1.0,
-        "top_p": 1.0,
-        "length_penalty": 1.0,
-        "early_stopping": False,
-        "no_repeat_ngram_size": None,
-        "min_length": 0
-    }
-
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
-        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-            test_prompts,
-            max_tokens,
-            num_logprobs,
-            **hf_kwargs,
-        ))
-
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=aphrodite_outputs,
-        name_0="hf",
-        name_1="aphrodite",
-    )

+ 0 - 75
tests/distributed/test_chunked_prefill_distributed.py

@@ -1,75 +0,0 @@
-"""Compare the outputs of HF and distributed Aphrodite when using greedy
-sampling.
-
-Run:
-```sh
-pytest test_chunked_prefill_distributed.py
-```
-"""
-import os
-
-import pytest
-
-from aphrodite.common.utils import cuda_device_count_stateless
-
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/opt-125m", "ray"),
-    ("meta-llama/Llama-2-7b-hf", "ray"),
-    ("facebook/opt-125m", "mp"),
-    ("meta-llama/Llama-2-7b-hf", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    aphrodite_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-) -> None:
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
-        assert distributed_executor_backend == "ray"
-        os.environ["APHRODITE_USE_RAY_SPMD_WORKER"] = "1"
-        os.environ["APHRODITE_USE_RAY_COMPILED_DAG"] = "1"
-
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    # NOTE: take care of the order. run Aphrodite first, and then run HF.
-    # Aphrodite needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    with aphrodite_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=aphrodite_outputs,
-        name_0="hf",
-        name_1="aphrodite",
-    )

+ 0 - 59
tests/distributed/test_multimodal_broadcast.py

@@ -1,59 +0,0 @@
-"""Compare the outputs of HF and distributed Aphrodite when using greedy
-sampling.
-
-Run:
-```sh
-pytest -s -v test_multimodal_broadcast.py
-```
-"""
-
-import pytest
-
-from aphrodite.common.utils import cuda_device_count_stateless
-
-from ..utils import fork_new_process_for_each_test
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("llava-hf/llava-1.5-7b-hf", "ray"),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
-    ("facebook/chameleon-7b", "ray"),
-    ("llava-hf/llava-1.5-7b-hf", "mp"),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
-    ("facebook/chameleon-7b", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(hf_runner, aphrodite_runner, image_assets, model: str,
-                distributed_executor_backend: str) -> None:
-
-    dtype = "half"
-    max_tokens = 5
-    num_logprobs = 5
-    tensor_parallel_size = 2
-
-    if model.startswith("llava-hf/llava-1.5"):
-        from ..models.test_llava import models, run_test
-    elif model.startswith("llava-hf/llava-v1.6"):
-        from ..models.test_llava_next import run_test  # type: ignore[no-redef]
-        from ..models.test_llava_next import models
-    elif model.startswith("facebook/chameleon"):
-        from ..models.test_chameleon import run_test  # type: ignore[no-redef]
-        from ..models.test_chameleon import models
-    else:
-        raise NotImplementedError(f"Unsupported model: {model}")
-
-    run_test(
-        hf_runner,
-        aphrodite_runner,
-        image_assets,
-        model=models[0],
-        # So that LLaVA-NeXT processor may return nested list
-        size_factors=[0.25, 0.5, 1.0],
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-    )

+ 7 - 7
tests/distributed/test_same_node.py

@@ -1,13 +1,13 @@
 import os
 
-import torch
+import torch.distributed as dist
 
 from aphrodite.distributed.parallel_state import in_the_same_node_as
 
-torch.distributed.init_process_group(backend="gloo")
-test_result = all(
-    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
+if __name__ == "__main__":
+    dist.init_process_group(backend="gloo")
+    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
 
-expected = os.environ.get("APHRODITE_TEST_SAME_HOST", "1") == "1"
-assert test_result == expected, f"Expected {expected}, got {test_result}"
-print("Same node test passed!")
+    expected = os.environ.get("APHRODITE_TEST_SAME_HOST", "1") == "1"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
+    print("Same node test passed!")

+ 6 - 3
tests/kernels/utils.py

@@ -11,7 +11,6 @@ import torch
 
 from aphrodite.attention import (AttentionBackend, AttentionMetadata,
                                  AttentionType)
-from aphrodite.attention.backends.xformers import XFormersBackend
 from aphrodite.common.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                                     make_tensor_with_pad)
 
@@ -22,6 +21,7 @@ DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
     "test_autograd_registration",
     "test_faketensor",
 )
+
 ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
     "test_schema",
     "test_autograd_registration",
@@ -211,8 +211,8 @@ def make_causal_mask(
 def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
                                   backend_name: str) -> None:
     '''
-    Override the environment variable indicating the vLLM backend temporarily,
-    using pytest monkeypatch to ensure that the env vars get
+    Override the environment variable indicating the Aphrodite backend 
+    temporarily, using pytest monkeypatch to ensure that the env vars get
     reset once the test context exits.
 
     Arguments:
@@ -521,6 +521,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
     * Backend instance
     '''
     if backend_name == STR_XFORMERS_ATTN_VAL:
+        # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
+        from aphrodite.attention.backends.xformers import XFormersBackend
+
         return XFormersBackend()
     raise AssertionError(
         f"Unrecognized backend_name {backend_name} for unit test")

+ 0 - 0
tests/models/decoder_only/__init__.py


+ 0 - 0
tests/models/decoder_only/audio_language/__init__.py


+ 3 - 6
tests/models/test_ultravox.py → tests/models/decoder_only/audio_language/test_ultravox.py

@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
 from aphrodite.common.sequence import SampleLogprobs
 from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ..conftest import AphroditeRunner, HfRunner
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import AphroditeRunner, HfRunner
+from ...utils import check_logprobs_close
 
 MODEL_NAME = "fixie-ai/ultravox-v0_3"
 
@@ -166,8 +164,7 @@ def run_multi_audio_test(
 def test_models(hf_runner, aphrodite_runner, audio, dtype: str, max_tokens: int,
                 num_logprobs: int) -> None:
 
-    aphrodite_prompt = _get_prompt(1, "Describe the audio above.",
-                                   APHRODITE_PLACEHOLDER)
+    aphrodite_prompt = _get_prompt(1, "Describe the audio above.", APHRODITE_PLACEHOLDER)
     hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
     run_test(
         hf_runner,

+ 0 - 0
tests/models/decoder_only/language/__init__.py


+ 2 - 22
tests/models/test_aqlm.py → tests/models/decoder_only/language/test_aqlm.py

@@ -7,26 +7,6 @@ import pytest
 
 from tests.quantization.utils import is_quant_method_supported
 
-# In this test we hardcode prompts and generations for the model so we don't
-# need to require the AQLM package as a dependency
-example_prompts = [
-    'Aphrodite is a high-throughput and memory-efficient inference and serving '
-    'engine for LLMs.\n',
-    'Briefly describe the major milestones in the development of artificial '
-    'intelligence from 1950 to 2020.\n',
-    'Compare and contrast artificial intelligence with human intelligence in '
-    'terms of processing information.\n',
-    'Describe the basic components of a neural network and how it can be '
-    'trained.\n',
-    'Write a short story about a robot that dreams for the first time.\n',
-    'Analyze the impact of the COVID-19 pandemic on global economic structures '
-    'and future business models.\n',
-    'Explain the cultural significance of the Mona Lisa painting, and how its '
-    'perception might vary in Western versus Eastern societies.\n',
-    "Translate the following English sentence into Japanese, French, and "
-    "Swahili: 'The early bird catches the worm.'\n"
-]
-
 # These ground truth generations were generated using `transformers==4.38.1
 # aqlm==1.1.0 torch==2.2.0`
 # and the below code:
@@ -79,8 +59,8 @@ def test_models(
 
     # loop through the prompts to compare against the ground truth generations
     for prompt_idx in range(len(example_prompts)):
-        aphrodite_output_ids, aphrodite_output_str, aphrodite_logprobs = (
-            aphrodite_outputs[prompt_idx])
+        aphrodite_output_ids, aphrodite_output_str, aphrodite_logprobs = aphrodite_outputs[
+            prompt_idx]
 
         print("Prompt:          ", repr(example_prompts[prompt_idx]))
         print("Reference output:", repr(ground_truth_generations[prompt_idx]))

+ 11 - 9
tests/models/test_big_models.py → tests/models/decoder_only/language/test_big_models.py

@@ -5,9 +5,10 @@ This tests bigger models and use half precision.
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
-import torch
 
-from .utils import check_outputs_equal
+from aphrodite.platforms import current_platform
+
+from ...utils import check_outputs_equal
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -19,10 +20,12 @@ MODELS = [
     # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]
 
+if not current_platform.is_cpu():
+    # MiniCPM requires fused_moe which is not supported by CPU
+    MODELS.append("openbmb/MiniCPM3-4B")
+
 #TODO: remove this after CPU float16 support ready
-target_dtype = "float"
-if torch.cuda.is_available():
-    target_dtype = "half"
+target_dtype = "float" if current_platform.is_cpu() else "half"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -39,9 +42,8 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
+    with aphrodite_runner(model, dtype=dtype, enforce_eager=True) as aphrodite_model:
+        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -58,7 +60,7 @@ def test_model_print(
     model: str,
     dtype: str,
 ) -> None:
-    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
+    with aphrodite_runner(model, dtype=dtype, enforce_eager=True) as aphrodite_model:
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
         print(aphrodite_model.model.llm_engine.model_executor.driver_worker.

+ 2 - 3
tests/models/test_danube3_4b.py → tests/models/decoder_only/language/test_danube3_4b.py

@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
 """
 import pytest
 
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 
 MODELS = ["h2oai/h2o-danube3-4b-base"]
 
@@ -28,8 +28,7 @@ def test_models(
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,

+ 99 - 0
tests/models/decoder_only/language/test_fp8.py

@@ -0,0 +1,99 @@
+# flake8: noqa
+"""Tests fp8 models against ground truth generation
+Note: these tests will only pass on L4 GPU.
+"""
+import os
+from typing import Optional
+
+import pytest
+
+from tests.kernels.utils import override_backend_env_variable
+from tests.quantization.utils import is_quant_method_supported
+
+from ...utils import check_logprobs_close
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.parametrize(
+    "kv_cache_dtype,base_model,test_model,scale_path",
+    [
+        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
+        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
+         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
+         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
+         "meta-llama/Llama-2-7b-chat-hf",
+         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+    ])
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset distributed env properly. Use a value > 1 just when you test.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+# Due to low-precision numerical divergence, this test is too sensitive for
+# the async postprocessor
+@pytest.mark.parametrize("disable_async_output_proc", [True])
+def test_models(
+    aphrodite_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    scale_path: Optional[str],
+    max_tokens: int,
+    enforce_eager: bool,
+    backend: str,
+    tensor_parallel_size: int,
+    disable_async_output_proc: bool,
+    monkeypatch,
+) -> None:
+    """
+    Only checks log probs match to cover the discrepancy in
+    numerical sensitive kernels.
+    """
+    override_backend_env_variable(monkeypatch, backend)
+
+    MAX_MODEL_LEN = 1024
+    NUM_LOG_PROBS = 8
+
+    with aphrodite_runner(
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype="auto",
+            disable_async_output_proc=disable_async_output_proc,
+    ) as aphrodite_model:
+        baseline_outputs = aphrodite_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
+
+    extra_kwargs = {}
+    if scale_path is not None:
+        extra_kwargs["quantization_param_path"] = scale_path
+
+    with aphrodite_runner(
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype=kv_cache_dtype,
+            disable_async_output_proc=disable_async_output_proc,
+            **extra_kwargs,
+    ) as aphrodite_model:
+        test_outputs = aphrodite_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
+
+    check_logprobs_close(
+        outputs_0_lst=baseline_outputs,
+        outputs_1_lst=test_outputs,
+        name_0="fp16_kv_cache",
+        name_1="fp8_kv_cache",
+    )
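
test_fp8.py selects the attention backend through `override_backend_env_variable` from tests/kernels/utils.py. Its body is not shown in this commit; judging by the docstring edited above, it plausibly reduces to a single monkeypatch `setenv`, sketched here with the names that module imports.

```python
# Plausible reconstruction of override_backend_env_variable (the real body
# lives in tests/kernels/utils.py and may differ). STR_BACKEND_ENV_VAR is the
# env-var name imported there from aphrodite.common.utils.
import pytest

from aphrodite.common.utils import STR_BACKEND_ENV_VAR


def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
                                  backend_name: str) -> None:
    # monkeypatch undoes the change when the test finishes, so one
    # parametrized backend cannot leak into the next test case.
    mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
```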

+ 20 - 6
tests/models/test_gguf.py → tests/models/decoder_only/language/test_gguf.py

@@ -7,10 +7,11 @@ import os
 
 import pytest
 from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
 
 from tests.quantization.utils import is_quant_method_supported
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
@@ -20,7 +21,7 @@ MAX_MODEL_LEN = 1024
 MODELS = [
     ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-                     filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf")),
+                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
     ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
                      filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
@@ -39,22 +40,36 @@ MODELS = [
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
+    num_gpus_available,
     aphrodite_runner,
     example_prompts,
     model,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
+    tp_size: int,
 ) -> None:
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
     original_model, gguf_model = model
 
+    tokenizer = AutoTokenizer.from_pretrained(original_model)
+    messages = [[{
+        'role': 'user',
+        'content': prompt
+    }] for prompt in example_prompts]
+    example_prompts = tokenizer.apply_chat_template(messages,
+                                                    tokenize=False,
+                                                    add_generation_prompt=True)
+
     # Run unquantized model.
     with aphrodite_runner(model_name=original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
-                     enforce_eager=True,
-                     tensor_parallel_size=1) as original_model:
+                     tensor_parallel_size=tp_size) as original_model:
 
         original_outputs = original_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
@@ -63,8 +78,7 @@ def test_models(
     with aphrodite_runner(model_name=gguf_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
-                     enforce_eager=True,
-                     tensor_parallel_size=1) as gguf_model:
+                     tensor_parallel_size=tp_size) as gguf_model:
         gguf_outputs = gguf_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
 

+ 2 - 2
tests/models/test_gptq_marlin.py → tests/models/decoder_only/language/test_gptq_marlin.py

@@ -12,10 +12,10 @@ import os
 
 import pytest
 
-from aphrodite.modeling.layers.rotary_embedding import _ROPE_DICT
 from tests.quantization.utils import is_quant_method_supported
+from aphrodite.modeling.layers.rotary_embedding import _ROPE_DICT
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 

+ 2 - 1
tests/models/test_gptq_marlin_24.py → tests/models/decoder_only/language/test_gptq_marlin_24.py

@@ -10,9 +10,10 @@ from dataclasses import dataclass
 
 import pytest
 
-from tests.models.utils import check_logprobs_close
 from tests.quantization.utils import is_quant_method_supported
 
+from ...utils import check_logprobs_close
+
 
 @dataclass
 class ModelPair:

+ 9 - 6
tests/models/test_mistral.py → tests/models/decoder_only/language/test_granite.py

@@ -1,17 +1,20 @@
-"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
+"""Compare the outputs of HF and Aphrodite for Granite models using greedy sampling.
 
-Run `pytest tests/models/test_mistral.py`.
+Run `pytest tests/models/test_granite.py`.
 """
 import pytest
+import transformers
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 MODELS = [
-    "mistralai/Mistral-7B-Instruct-v0.1",
-    "mistralai/Mistral-7B-Instruct-v0.3",
+    "ibm/PowerLM-3b",
 ]
 
 
+# GraniteForCausalLM will be in transformers >= 4.45
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="granite model test requires transformers >= 4.45")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
@@ -25,7 +28,7 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    # TODO: Sliding window should be tested separately.
+    # TODO(sang): Sliding window should be tested separately.
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)

+ 7 - 10
tests/models/test_jamba.py → tests/models/decoder_only/language/test_jamba.py

@@ -1,7 +1,8 @@
 import pytest
 
 from aphrodite.worker.model_runner import _get_graph_batch_size
-from tests.models.utils import check_outputs_equal
+
+from ...utils import check_outputs_equal
 
 MODELS = ["ai21labs/Jamba-tiny-random"]
 
@@ -25,17 +26,15 @@ def test_models(
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
         aphrodite_output_ids, aphrodite_output_str = aphrodite_outputs[i]
         assert hf_output_str == aphrodite_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nAphrodite: "
-            f"{aphrodite_output_str!r}")
+            f"Test{i}:\nHF: {hf_output_str!r}\nAPHRODITE: {aphrodite_output_str!r}")
         assert hf_output_ids == aphrodite_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nAphrodite: {aphrodite_output_ids}")
+            f"Test{i}:\nHF: {hf_output_ids}\nAPHRODITE: {aphrodite_output_ids}")
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -114,8 +113,7 @@ def test_models_preemption_recompute(
 
         aphrodite_model.model.llm_engine.scheduler[
             0].ENABLE_ARTIFICIAL_PREEMPT = False
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=preempt_aphrodite_outputs,
@@ -140,8 +138,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     # statelessness mechanism where it can cleanup new incoming requests in
     # a single step.
     try:
-        with aphrodite_runner(model, dtype=dtype,
-                              max_num_seqs=10) as aphrodite_model:
+        with aphrodite_runner(model, dtype=dtype, max_num_seqs=10) as aphrodite_model:
             aphrodite_model.generate_greedy([example_prompts[0]] * 100, 10)
     except ValueError:
         pytest.fail("Jamba inner state wasn't cleaned up properly between"

+ 1 - 1
tests/models/test_marlin.py → tests/models/decoder_only/language/test_marlin.py

@@ -16,7 +16,7 @@ import pytest
 
 from tests.quantization.utils import is_quant_method_supported
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 
 @dataclass

+ 175 - 0
tests/models/decoder_only/language/test_mistral.py

@@ -0,0 +1,175 @@
+"""Compare the outputs of HF and Aphrodite for Mistral models using greedy
+sampling.
+
+Run `pytest tests/models/test_mistral.py`.
+"""
+import pytest
+
+from aphrodite import LLM, SamplingParams
+
+from ...utils import check_logprobs_close
+
+MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    # Mistral-Nemo is too big for CI, but passes locally
+    # "mistralai/Mistral-Nemo-Instruct-2407"
+]
+
+SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
+SYMBOLIC_LANG_PROMPTS = [
+    "勇敢な船乗りについての詩を書く",  # japanese
+    "寫一首關於勇敢的水手的詩",  # chinese
+]
+
+# for function calling
+TOOLS = [{
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type":
+                    "string",
+                    "description":
+                    "The city to find the weather for, e.g. 'San Francisco'"
+                },
+                "state": {
+                    "type":
+                    "string",
+                    "description":
+                    "the two-letter abbreviation for the state that the city is"
+                    " in, e.g. 'CA' which would mean 'California'"
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"]
+                }
+            },
+            "required": ["city", "state", "unit"]
+        }
+    }
+}]
+MSGS = [{
+    "role":
+    "user",
+    "content": ("Can you tell me what the temperate"
+                " will be in Dallas, in fahrenheit?")
+}]
+EXPECTED_FUNC_CALL = (
+    '[{"name": "get_current_weather", "arguments": '
+    '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]')
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    aphrodite_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    # TODO(sang): Sliding window should be tested separately.
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    with aphrodite_runner(model, dtype=dtype,
+                     tokenizer_mode="mistral") as aphrodite_model:
+        aphrodite_outputs = aphrodite_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=aphrodite_outputs,
+        name_0="hf",
+        name_1="aphrodite",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_mistral_format(
+    aphrodite_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with aphrodite_runner(
+            model,
+            dtype=dtype,
+            tokenizer_mode="auto",
+            load_format="safetensors",
+            config_format="hf",
+    ) as hf_format_model:
+        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    with aphrodite_runner(
+            model,
+            dtype=dtype,
+            tokenizer_mode="mistral",
+            load_format="mistral",
+            config_format="mistral",
+    ) as mistral_format_model:
+        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_format_outputs,
+        outputs_1_lst=mistral_format_outputs,
+        name_0="hf",
+        name_1="mistral",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
+def test_mistral_symbolic_languages(
+    model: str,
+    dtype: str,
+    prompt: str,
+) -> None:
+    prompt = "hi"
+    msg = {"role": "user", "content": prompt}
+    llm = LLM(model=model,
+              dtype=dtype,
+              max_model_len=8192,
+              tokenizer_mode="mistral",
+              config_format="mistral",
+              load_format="mistral")
+    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
+    assert "�" not in outputs[0].outputs[0].text.strip()
+
+
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+def test_mistral_function_calling(
+    aphrodite_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    with aphrodite_runner(model,
+                     dtype=dtype,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as aphrodite_model:
+        outputs = aphrodite_model.model.chat(MSGS,
+                                        tools=TOOLS,
+                                        sampling_params=SAMPLING_PARAMS)
+
+        assert outputs[0].outputs[0].text.strip() == EXPECTED_FUNC_CALL

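The function-calling test above pins the output to the exact EXPECTED_FUNC_CALL string. For reference, a looser check (a hypothetical alternative, not something this diff adds) could parse the emitted tool call as JSON and compare fields, which tolerates harmless formatting differences; a minimal sketch, assuming the output is a JSON list of {name, arguments} objects like EXPECTED_FUNC_CALL:

import json

def check_weather_call(text: str) -> None:
    # Expect a JSON list containing a single get_current_weather call.
    calls = json.loads(text)
    assert len(calls) == 1 and calls[0]["name"] == "get_current_weather"
    args = calls[0]["arguments"]
    assert args["city"] == "Dallas"
    assert args["unit"] == "fahrenheit"
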
+ 79 - 0
tests/models/decoder_only/language/test_modelopt.py

@@ -0,0 +1,79 @@
+# flake8: noqa
+"""Tests Model Optimizer fp8 models against ground truth generation
+Note: these tests will only pass on H100
+"""
+import os
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from aphrodite import LLM, SamplingParams
+from tests.quantization.utils import is_quant_method_supported
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
+
+EXPECTED_STRS_MAP = {
+    "nvidia/Llama-3.1-8B-Instruct-FP8": [
+        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
+        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+        '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
+    ]
+}
+
+
+# This test compares against golden strings for an exact match, since
+# there is no baseline implementation to compare against, and it is
+# unstable w.r.t. the specifics of the fp8 implementation and the
+# hardware it runs on.
+# Disabled to prevent it from breaking the build.
+@pytest.mark.skip(
+    reason=
+    "Prevent unstable test based on golden strings from breaking the build.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    model = LLM(
+        model=model_name,
+        max_model_len=MAX_MODEL_LEN,
+        trust_remote_code=True,
+        enforce_eager=True,
+        quantization="modelopt",
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations: List[str] = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+
+    print(model_name, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")

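Since these golden strings have no reference implementation to diff against, updating EXPECTED_STRS_MAP means rerunning the generation loop above on the target GPU and pasting the new outputs back into the dict. A small formatting helper for that step (purely illustrative, not part of this PR) might look like:

from typing import List

def format_golden_entry(model_name: str, generations: List[str]) -> str:
    # Render one EXPECTED_STRS_MAP entry, ready to paste into the test file.
    body = ",\n        ".join(repr(g) for g in generations)
    return f'    "{model_name}": [\n        {body}\n    ],'

Calling print(format_golden_entry(model_name, generations)) after the loop reproduces the dict layout used above.
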
+ 2 - 3
tests/models/test_models.py → tests/models/decoder_only/language/test_models.py

@@ -7,7 +7,7 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest
 
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 
 MODELS = [
     "facebook/opt-125m",
@@ -41,8 +41,7 @@ def test_models(
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
-        aphrodite_outputs = aphrodite_model.generate_greedy(
-            example_prompts, max_tokens)
+        aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,

+ 112 - 0
tests/models/decoder_only/language/test_phimoe.py

@@ -0,0 +1,112 @@
+"""Compare the outputs of HF and Aphrodite for moe models using greedy sampling.
+
+Run `pytest tests/models/test_phimoe.py`.
+"""
+import pytest
+import torch
+
+from aphrodite.common.utils import is_cpu
+
+from ...utils import check_logprobs_close
+
+MODELS = [
+    "microsoft/Phi-3.5-MoE-instruct",
+]
+
+
+def test_phimoe_routing_function():
+    from aphrodite.modeling.models.phimoe import phimoe_routing_function
+    test_case = {
+        0: {
+            "hidden_states":
+            torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
+                         dtype=torch.float32,
+                         requires_grad=False).view(4, 2),
+            "gating_output":
+            torch.tensor([0.1, 0.2, 0.3, 0.4],
+                         dtype=torch.float32,
+                         requires_grad=False),
+            "topk":
+            2,
+            "renormalize":
+            False,
+        },
+        1: {
+            "hidden_states":
+            torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
+                         dtype=torch.float32,
+                         requires_grad=False).view(4, 2),
+            "gating_output":
+            torch.tensor([0.4, 0.2, 0.3, 0.4],
+                         dtype=torch.float32,
+                         requires_grad=False),
+            "topk":
+            2,
+            "renormalize":
+            False,
+        }
+    }
+
+    ground_truth = {
+        0: {
+            "topk_weights":
+            torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
+            "topk_ids":
+            torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
+        },
+        1: {
+            "topk_weights":
+            torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
+            "topk_ids":
+            torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
+        }
+    }
+
+    for test_id in test_case:
+        topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
+        assert torch.allclose(topk_weights,
+                              ground_truth[test_id]["topk_weights"])
+        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
+
+
+def get_gpu_memory():
+    try:
+        props = torch.cuda.get_device_properties(torch.cuda.current_device())
+        gpu_memory = props.total_memory / (1024**3)
+        return gpu_memory
+    except Exception:
+        return 0
+
+
+@pytest.mark.skipif(condition=is_cpu(),
+                    reason="This test takes a lot time to run on CPU, "
+                    "and aphrodite CI's disk space is not enough for this "
+                    "model.")
+@pytest.mark.skipif(condition=get_gpu_memory() < 100,
+                    reason="Skip this test if GPU memory is insufficient.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    aphrodite_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
+        aphrodite_outputs = aphrodite_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=aphrodite_outputs,
+        name_0="hf",
+        name_1="aphrodite",
+    )

+ 0 - 0
tests/models/decoder_only/vision_language/__init__.py


+ 5 - 8
tests/models/test_blip2.py → tests/models/decoder_only/vision_language/test_blip2.py

@@ -3,13 +3,11 @@ from typing import List, Optional, Tuple
 import pytest
 from transformers import AutoModelForVision2Seq, AutoTokenizer
 
-from aphrodite.common.sequence import SampleLogprobs
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.common.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -56,7 +54,7 @@ def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
                 dtype: str, max_tokens: int, num_logprobs: int) -> None:
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalData objects and corresponding
     MultiModalConfig as input.
@@ -71,8 +69,7 @@ def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
     ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # max_model_len should be greater than image_feature_size
-    with aphrodite_runner(model, dtype=dtype,
-                          enforce_eager=True) as aphrodite_model:
+    with aphrodite_runner(model, dtype=dtype, enforce_eager=True) as aphrodite_model:
         aphrodite_outputs_per_image = [
             aphrodite_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,

+ 42 - 0
tests/models/decoder_only/vision_language/test_broadcast.py

@@ -0,0 +1,42 @@
+import pytest
+
+from ....utils import multi_gpu_test
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", [
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "facebook/chameleon-7b",
+])
+def test_models(hf_runner, aphrodite_runner, image_assets,
+                distributed_executor_backend, model) -> None:
+
+    dtype = "half"
+    max_tokens = 5
+    num_logprobs = 5
+    tensor_parallel_size = 2
+
+    if model.startswith("llava-hf/llava-1.5"):
+        from .test_llava import models, run_test
+    elif model.startswith("llava-hf/llava-v1.6"):
+        from .test_llava_next import models, run_test  # type: ignore[no-redef]
+    elif model.startswith("facebook/chameleon"):
+        from .test_chameleon import models, run_test  # type: ignore[no-redef]
+    else:
+        raise NotImplementedError(f"Unsupported model: {model}")
+
+    run_test(
+        hf_runner,
+        aphrodite_runner,
+        image_assets,
+        model=models[0],
+        # So that the LLaVA-NeXT processor may return a nested list
+        size_factors=[0.25, 0.5, 1.0],
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )

+ 6 - 8
tests/models/test_chameleon.py → tests/models/decoder_only/vision_language/test_chameleon.py

@@ -3,13 +3,11 @@ from typing import List, Optional, Type
 import pytest
 from transformers import AutoModelForVision2Seq, BatchEncoding
 
-from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ..conftest import IMAGE_ASSETS, AphroditeRunner, HfRunner, _ImageAssets
-from .utils import check_outputs_equal
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, AphroditeRunner, _ImageAssets
+from ...utils import check_outputs_equal
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -36,7 +34,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.
@@ -85,8 +83,8 @@ def run_test(
 
     for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_image,
                                         aphrodite_outputs_per_image):
-        # HF Logprobs include image tokens, unlike Aphrodite, so we don't
-        # directly compare them
+        # HF Logprobs include image tokens, unlike Aphrodite, so we don't directly
+        # compare them
         check_outputs_equal(
             outputs_0_lst=[outputs[:2] for outputs in hf_outputs],
             outputs_1_lst=[outputs[:2] for outputs in aphrodite_outputs],

+ 5 - 8
tests/models/test_fuyu.py → tests/models/decoder_only/vision_language/test_fuyu.py

@@ -2,14 +2,12 @@ from typing import List, Optional, Tuple, Type
 
 import pytest
 
+from aphrodite.multimodal.utils import rescale_image_size
 from aphrodite.common.sequence import SampleLogprobs
 from aphrodite.common.utils import is_cpu
-from aphrodite.multimodal.utils import rescale_image_size
-
-from ..conftest import IMAGE_ASSETS, AphroditeRunner, HfRunner, _ImageAssets
-from .utils import check_logprobs_close
 
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, AphroditeRunner, _ImageAssets
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -46,7 +44,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
@@ -99,8 +97,7 @@ def run_test(
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                aphrodite_to_hf_output(aphrodite_output)
-                for aphrodite_output in aphrodite_outputs
+                aphrodite_to_hf_output(aphrodite_output) for aphrodite_output in aphrodite_outputs
             ],
             name_0="hf",
             name_1="aphrodite",

+ 77 - 0
tests/models/decoder_only/vision_language/test_intern_vit.py

@@ -0,0 +1,77 @@
+from typing import Optional
+
+import pytest
+import torch
+import torch.nn as nn
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModel, CLIPImageProcessor
+
+from ....conftest import _ImageAssets, cleanup
+
+# we use snapshot_download to prevent conflicts between
+# dynamic_module and trust_remote_code for hf_runner
+DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
+models = [
+    snapshot_download("OpenGVLab/InternViT-300M-448px",
+                      allow_patterns=DOWNLOAD_PATTERN),
+    snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5",
+                      allow_patterns=DOWNLOAD_PATTERN),
+]
+
+
+def run_intern_vit_test(
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    dtype: str,
+    distributed_executor_backend: Optional[str] = None,
+):
+    img_processor = CLIPImageProcessor.from_pretrained(model)
+    images = [asset.pil_image for asset in image_assets]
+    pixel_values = [
+        img_processor(images, return_tensors='pt').pixel_values.to(dtype)
+        for images in images
+    ]
+
+    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+    if not getattr(config, "norm_type", None):
+        config.norm_type = "rms_norm"
+
+    hf_model = AutoModel.from_pretrained(model,
+                                         torch_dtype=dtype,
+                                         trust_remote_code=True).to("cuda")
+    hf_outputs_per_image = [
+        hf_model(pixel_value.to("cuda")).last_hidden_state
+        for pixel_value in pixel_values
+    ]
+
+    from aphrodite.modeling.models.intern_vit import InternVisionModel
+    aphrodite_model = InternVisionModel(config)
+    aphrodite_model.load_weights(hf_model.state_dict().items())
+
+    del hf_model
+    cleanup()
+
+    aphrodite_model = aphrodite_model.to("cuda", dtype)
+    aphrodite_outputs_per_image = [
+        aphrodite_model(pixel_values=pixel_value.to("cuda"))
+        for pixel_value in pixel_values
+    ]
+    del aphrodite_model
+    cleanup()
+
+    cos_similar = nn.CosineSimilarity(dim=-1)
+    for aphrodite_output, hf_output in zip(aphrodite_outputs_per_image,
+                                      hf_outputs_per_image):
+        assert cos_similar(aphrodite_output, hf_output).mean() > 0.99
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [torch.half])
+@torch.inference_mode()
+def test_models(dist_init, image_assets, model, dtype: str) -> None:
+    run_intern_vit_test(
+        image_assets,
+        model,
+        dtype=dtype,
+    )

+ 7 - 7
tests/models/test_internvl.py → tests/models/decoder_only/vision_language/test_internvl.py

@@ -6,14 +6,12 @@ import torch
 from PIL.Image import Image
 from transformers import AutoConfig
 
-from aphrodite.common.utils import is_cpu
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.utils import is_cpu
 
-from ..conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner,
-                        PromptImageInput, _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, AphroditeRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -78,7 +76,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
@@ -341,10 +339,12 @@ def test_different_num_patches(hf_runner, aphrodite_runner, image_assets, model,
                                size_factors, dtype: str, max_tokens: int,
                                num_logprobs: int) -> None:
     images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
+
     inputs_batching = [(
         [prompt for _ in size_factors],
         [rescale_image_size(image, factor) for factor in size_factors],
     ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
     inputs_multi_images = [
         ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
          [[rescale_image_size(image, factor) for image in images]

+ 7 - 9
tests/models/test_llava.py → tests/models/decoder_only/vision_language/test_llava.py

@@ -4,15 +4,13 @@ import pytest
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                           BatchEncoding)
 
-from aphrodite.common.sequence import SampleLogprobs
-from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.sequence import SampleLogprobs
+from aphrodite.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ..conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner,
-                        PromptImageInput, _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, AphroditeRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close
 
 _LIMIT_IMAGE_PER_PROMPT = 4
 
@@ -143,7 +141,7 @@ def _run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
@@ -239,7 +237,7 @@ def _run_test(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+                dtype, max_tokens, num_logprobs) -> None:
     run_test(
         hf_runner,
         aphrodite_runner,

+ 4 - 6
tests/models/test_llava_image_embeds.py → tests/models/decoder_only/vision_language/test_llava_image_embeds.py

@@ -3,12 +3,10 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 
-from aphrodite.common.sequence import SampleLogprobs
+from aphrodite.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS, AphroditeRunner, HfRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, AphroditeRunner, _ImageAssets
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -62,7 +60,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.

+ 6 - 8
tests/models/test_llava_next.py → tests/models/decoder_only/vision_language/test_llava_next.py

@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from aphrodite.common.sequence import SampleLogprobs
 from aphrodite.multimodal.utils import rescale_image_size
 
-from ..conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner,
-                        PromptImageInput, _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner,
+                          PromptImageInput, _ImageAssets)
+from ...utils import check_logprobs_close
 
 _LIMIT_IMAGE_PER_PROMPT = 4
 
@@ -197,7 +195,7 @@ def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
                 dtype, max_tokens, num_logprobs) -> None:
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects
     and corresponding MultiModalConfig as input.
@@ -225,8 +223,8 @@ def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models_fixed_sizes(hf_runner, aphrodite_runner, image_assets, model,
-                            sizes, dtype, max_tokens, num_logprobs) -> None:
+def test_models_fixed_sizes(hf_runner, aphrodite_runner, image_assets, model, sizes,
+                            dtype, max_tokens, num_logprobs) -> None:
     run_test(
         hf_runner,
         aphrodite_runner,

+ 62 - 84
tests/models/test_llava_next_video.py → tests/models/decoder_only/vision_language/test_llava_next_video.py

@@ -4,47 +4,48 @@ import pytest
 import transformers
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 
-from aphrodite.common.sequence import SampleLogprobs
 from aphrodite.multimodal.utils import (rescale_video_size, resize_video,
-                                        sample_frames_from_video)
+                                   sample_frames_from_video)
+from aphrodite.sequence import SampleLogprobs
 
-from ..conftest import VIDEO_ASSETS, AphroditeRunner, HfRunner, _VideoAssets
-from .utils import check_logprobs_close
+from ....conftest import VIDEO_ASSETS, HfRunner, AphroditeRunner, _VideoAssets
+from ...utils import check_logprobs_close
 
-pytestmark = pytest.mark.vlm
 _PREFACE = (
     "A chat between a curious human and an artificial intelligence assistant. "
     "The assistant gives helpful, detailed, and polite answers to the human's "
-    "questions."
-)
-HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
-    {
-        "sample_demo_1": f"{_PREFACE}USER: <video>\nWhy is this video funny? "
-        "ASSISTANT:"
-    }
-)
+    "questions.")
+
+HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
+    "sample_demo_1":
+    f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
+})
+
 models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
 
 
-def aphrodite_to_hf_output(
-    aphrodite_output: Tuple[List[int],
-                            str, Optional[SampleLogprobs]], model: str
-):
+def aphrodite_to_hf_output(aphrodite_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
     """Sanitize aphrodite output to be comparable with hf output."""
     output_ids, output_str, out_logprobs = aphrodite_output
+
     config = AutoConfig.from_pretrained(model)
     video_token_id = config.video_token_index
+
     tokenizer = AutoTokenizer.from_pretrained(model)
     eos_token_id = tokenizer.eos_token_id
+
     hf_output_ids = [
-        token_id
-        for idx, token_id in enumerate(output_ids)
+        token_id for idx, token_id in enumerate(output_ids)
         if token_id != video_token_id or output_ids[idx - 1] != video_token_id
     ]
+
     assert output_str[0] == " "
     hf_output_str = output_str[1:]
     if hf_output_ids[-1] == eos_token_id:
         hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
     return hf_output_ids, hf_output_str, out_logprobs
 
 
@@ -103,53 +104,50 @@ def run_test(
         sample_frames_from_video(asset.np_ndarrays, num_frames)
         for asset in video_assets
     ]
+
     for video in videos:
         print(video.shape)
+
     if size_factors is not None:
-        inputs_per_video = [
-            (
-                [prompt for _ in size_factors],
-                [rescale_video_size(video, factor) for factor in size_factors],
-            )
-            for video, prompt in zip(videos, HF_VIDEO_PROMPTS)
-        ]
+        inputs_per_video = [(
+            [prompt for _ in size_factors],
+            [rescale_video_size(video, factor) for factor in size_factors],
+        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
     elif sizes is not None:
-        inputs_per_video = [
-            (
-                [prompt for _ in sizes],
-                [resize_video(video, size) for size in sizes],
-            )
-            for video, prompt in zip(videos, HF_VIDEO_PROMPTS)
-        ]
+        inputs_per_video = [(
+            [prompt for _ in sizes],
+            [resize_video(video, size) for size in sizes],
+        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
     else:
         raise ValueError("You must provide either `size_factors` or `sizes`")
+
     # max_model_len should be greater than image_feature_size
-    with aphrodite_runner(
-        model,
-        dtype=dtype,
-        max_model_len=4096,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-        enforce_eager=True,
-    ) as aphrodite_model:
+    with aphrodite_runner(model,
+                     dtype=dtype,
+                     max_model_len=4096,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as aphrodite_model:
         aphrodite_outputs_per_video = [
-            aphrodite_model.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs=num_logprobs, videos=videos
-            )
+            aphrodite_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                videos=videos)
             for prompts, videos in inputs_per_video
         ]
-    with hf_runner(
-        model, dtype=dtype, auto_cls=AutoModelForVision2Seq
-    ) as hf_model:
+
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
         hf_outputs_per_video = [
-            hf_model.generate_greedy_logprobs_limit(
-                prompts, max_tokens, num_logprobs=num_logprobs, videos=videos
-            )
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    videos=videos)
             for prompts, videos in inputs_per_video
         ]
-    for hf_outputs, aphrodite_outputs in zip(
-        hf_outputs_per_video, aphrodite_outputs_per_video
-    ):
+
+    for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_video,
+                                        aphrodite_outputs_per_video):
         # TODO: Check whether using original CLIPVisionModel can improve
         # consistency against HF
         check_logprobs_close(
@@ -163,10 +161,8 @@ def run_test(
         )
 
 
-@pytest.mark.skipif(
-    transformers.__version__ < "4.45",
-    reason="Waiting for next transformers release",
-)
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -185,21 +181,13 @@ def run_test(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("num_frames", [16])
-def test_models(
-    hf_runner,
-    aphrodite_runner,
-    video_assets,
-    model,
-    size_factors,
-    dtype,
-    max_tokens,
-    num_logprobs,
-    num_frames,
-) -> None:
+def test_models(hf_runner, aphrodite_runner, video_assets, model, size_factors,
+                dtype, max_tokens, num_logprobs, num_frames) -> None:
     """Inference result should be the same between hf and aphrodite.
+
     All the video fixtures for the test are under tests/videos.
     For huggingface runner, we provide the np.ndarray as input.
-    For aphrodite runner, we provide MultiModalDataDict objects
+    For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by aphrodite contract.
     The text output is sanitized to be able to compare with hf.
@@ -218,10 +206,8 @@ def test_models(
     )
 
 
-@pytest.mark.skipif(
-    transformers.__version__ < "4.45",
-    reason="Waiting for next transformers release",
-)
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "sizes",
@@ -231,17 +217,9 @@ def test_models(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("num_frames", [16])
-def test_models_fixed_sizes(
-    hf_runner,
-    aphrodite_runner,
-    video_assets,
-    model,
-    sizes,
-    dtype,
-    max_tokens,
-    num_logprobs,
-    num_frames,
-) -> None:
+def test_models_fixed_sizes(hf_runner, aphrodite_runner, video_assets, model, sizes,
+                            dtype, max_tokens, num_logprobs,
+                            num_frames) -> None:
     run_test(
         hf_runner,
         aphrodite_runner,

+ 37 - 107
tests/models/test_minicpmv.py → tests/models/decoder_only/vision_language/test_minicpmv.py

@@ -1,17 +1,16 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 
 import pytest
 import torch
 import torch.types
+from PIL import Image
 from transformers import BatchEncoding
 
-from aphrodite.common.sequence import SampleLogprobs
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS, AphroditeRunner, HfRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, AphroditeRunner
+from ...utils import check_logprobs_close
 
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -24,6 +23,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
         "(<image>./</image>)\nWhat is the season?<|eot_id|>" \
         "<|start_header_id|>assistant<|end_header_id|>\n\n",
 })
+HF_MULTIIMAGE_IMAGE_PROMPT = \
+    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
+    "(<image>./</image>)\n(<image>./</image>)\n" \
+    "Describe these images.<|eot_id|>" \
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
 
 models = ["openbmb/MiniCPM-Llama3-V-2_5"]
 
@@ -46,31 +50,26 @@ target_dtype = "half"
 def run_test(
     hf_runner: Type[HfRunner],
     aphrodite_runner: Type[AphroditeRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], Union[List[Image.Image],
+                                        List[List[Image.Image]]]]],
     model: str,
     *,
-    size_factors: List[float],
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
+    mm_limit: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by aphrodite contract.
     The text output is sanitized to be able to compare with hf.
     """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run Aphrodite first, and then run HF.
     # Aphrodite needs a fresh new process without cuda initialization.
@@ -82,6 +81,7 @@ def run_test(
                      max_model_len=4096,
                      max_num_seqs=1,
                      dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True) as aphrodite_model:
@@ -93,7 +93,7 @@ def run_test(
                                                 num_logprobs=num_logprobs,
                                                 images=images,
                                                 stop_token_ids=stop_token_ids)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
         ]
 
     hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
@@ -104,7 +104,7 @@ def run_test(
                                                     num_logprobs=num_logprobs,
                                                     images=images,
                                                     tokenizer=tokenizer)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
         ]
 
     for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_image,
@@ -138,104 +138,26 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
                 dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
     run_test(
         hf_runner,
         aphrodite_runner,
-        image_assets,
+        inputs_per_image,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
+        mm_limit=1,
         tensor_parallel_size=1,
     )
 
 
-HF_MULTIIMAGE_IMAGE_PROMPT = \
-    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
-    "(<image>./</image>)\n(<image>./</image>)\n" \
-    "Describe these images.<|eot_id|>" \
-    "<|start_header_id|>assistant<|end_header_id|>\n\n"
-
-
-def run_multi_image_test(
-    hf_runner: Type[HfRunner],
-    aphrodite_runner: Type[AphroditeRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Inference result should be the same between hf and aphrodite.
-
-    All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the PIL images as input.
-    For aphrodite runner, we provide MultiModalDataDict objects 
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by aphrodite contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_case = [
-        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-         [[rescale_image_size(image, factor) for image in images]
-          for factor in size_factors])
-    ]
-
-    # NOTE: take care of the order. run Aphrodite first, and then run HF.
-    # Aphrodite needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    # max_model_len should be greater than image_feature_size
-    with aphrodite_runner(model,
-                     max_model_len=4096,
-                     max_num_seqs=1,
-                     limit_mm_per_prompt={"image": len(images)},
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as aphrodite_model:
-        tokenizer = aphrodite_model.model.get_tokenizer()
-        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
-        aphrodite_outputs_per_case = [
-            aphrodite_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                stop_token_ids=stop_token_ids)
-            for prompts, images in inputs_per_case
-        ]
-
-    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
-    with hf_model, torch.no_grad():
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    tokenizer=tokenizer)
-            for prompts, images in inputs_per_case
-        ]
-
-    for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_case,
-                                        aphrodite_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=[
-                trunc_hf_output(hf_output) for hf_output in hf_outputs
-            ],
-            outputs_1_lst=aphrodite_outputs,
-            name_0="hf",
-            name_1="aphrodite",
-        )
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -256,14 +178,22 @@ def run_multi_image_test(
 def test_multi_images_models(hf_runner, aphrodite_runner, image_assets, model,
                              size_factors, dtype: str, max_tokens: int,
                              num_logprobs: int) -> None:
-    run_multi_image_test(
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
+    run_test(
         hf_runner,
         aphrodite_runner,
-        image_assets,
+        inputs_per_case,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
+        mm_limit=2,
         tensor_parallel_size=1,
     )

+ 6 - 8
tests/models/test_paligemma.py → tests/models/decoder_only/vision_language/test_paligemma.py

@@ -4,14 +4,12 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 
-from aphrodite.common.sequence import SampleLogprobs
-from aphrodite.common.utils import is_hip
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.sequence import SampleLogprobs
+from aphrodite.utils import is_hip
 
-from ..conftest import IMAGE_ASSETS, AphroditeRunner, HfRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, AphroditeRunner, _ImageAssets
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -26,7 +24,7 @@ models = ["google/paligemma-3b-mix-224"]
 # excessive use of shared memory. Use other backends in the meantime.
 # FIXME (mattwong, gshtrasb, hongxiayan)
 if is_hip():
-    os.environ["APHRODITE_USE_TRITON_FLASH_ATTN"] = "0"
+    os.environ["Aphrodite_USE_TRITON_FLASH_ATTN"] = "0"
 
 
 def aphrodite_to_hf_output(aphrodite_output: Tuple[List[int], str,
@@ -69,7 +67,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.

+ 6 - 9
tests/models/test_phi3v.py → tests/models/decoder_only/vision_language/test_phi3v.py

@@ -5,15 +5,12 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoTokenizer
 
-from aphrodite.common.sequence import SampleLogprobs
-from aphrodite.common.utils import is_cpu, is_hip
 from aphrodite.multimodal.utils import rescale_image_size
+from aphrodite.sequence import SampleLogprobs
+from aphrodite.utils import is_cpu, is_hip
 
-from ..conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner,
-                        PromptImageInput)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, AphroditeRunner
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -72,7 +69,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and aphrodite.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
@@ -106,7 +103,7 @@ def run_test(
     hf_model_kwargs = {"_attn_implementation": "eager"}
     with hf_runner(model, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
-        eos_token_id = hf_model.tokenizer.eos_token_id
+        eos_token_id = hf_model.processor.tokenizer.eos_token_id
         hf_outputs_per_case = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,

+ 54 - 22
tests/models/test_pixtral.py → tests/models/decoder_only/vision_language/test_pixtral.py

@@ -1,10 +1,11 @@
-"""Compare the outputs of HF and Aphrodite for Pixtral models using greedy
-sampling.
-Run `pytest tests/models/test_pixtral.py`.
+"""Compare the outputs of HF and Aphrodite for Mistral models using greedy sampling.
+
+Run `pytest tests/models/test_mistral.py`.
 """
-import pickle
+import json
 import uuid
-from typing import Any, Dict, List
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
 import pytest
 from mistral_common.protocol.instruct.messages import ImageURLChunk
@@ -13,12 +14,16 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
 
 from aphrodite import AphroditeEngine, EngineArgs, SamplingParams
+from aphrodite.common.sequence import Logprob, SampleLogprobs
 from aphrodite.inputs import TokensPrompt
 from aphrodite.multimodal import MultiModalDataBuiltins
 
-from .utils import check_logprobs_close
+from ....utils import APHRODITE_PATH
+from ...utils import check_logprobs_close
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
 
-pytestmark = pytest.mark.vlm
 MODELS = ["mistralai/Pixtral-12B-2409"]
 IMG_URLS = [
     "https://picsum.photos/id/237/400/300",
@@ -81,13 +86,40 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 LIMIT_MM_PER_PROMPT = dict(image=4)
 
 MAX_MODEL_LEN = [8192, 65536]
-FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.pickle"
-FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.pickle"
 
+FIXTURES_PATH = APHRODITE_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+
+FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
+FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
+
+OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
+
+
+# For the test author to store golden output in JSON
+def _dump_outputs_w_logprobs(
+    outputs: OutputsLogprobs,
+    filename: "StrPath",
+) -> None:
+    json_data = [(tokens, text,
+                  [{k: asdict(v)
+                    for k, v in token_logprobs.items()}
+                   for token_logprobs in (logprobs or [])])
+                 for tokens, text, logprobs in outputs]
+
+    with open(filename, "w") as f:
+        json.dump(json_data, f)
+
+
+def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
+    with open(filename, "rb") as f:
+        json_data = json.load(f)
 
-def load_logprobs(filename: str) -> Any:
-    with open(filename, 'rb') as f:
-        return pickle.load(f)
+    return [(tokens, text,
+             [{int(k): Logprob(**v)
+               for k, v in token_logprobs.items()}
+              for token_logprobs in logprobs])
+            for tokens, text, logprobs in json_data]
 
 
 @pytest.mark.skip(
@@ -103,7 +135,7 @@ def test_chat(
     model: str,
     dtype: str,
 ) -> None:
-    EXPECTED_CHAT_LOGPROBS = load_logprobs(FIXTURE_LOGPROBS_CHAT)
+    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
     with aphrodite_runner(
             model,
             dtype=dtype,
@@ -120,10 +152,10 @@ def test_chat(
             outputs.extend(output)
 
     logprobs = aphrodite_runner._final_steps_generate_w_logprobs(outputs)
-    check_logprobs_close(outputs_0_lst=logprobs,
-                         outputs_1_lst=EXPECTED_CHAT_LOGPROBS,
-                         name_0="output",
-                         name_1="h100_ref")
+    check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
+                         outputs_1_lst=logprobs,
+                         name_0="h100_ref",
+                         name_1="output")
 
 
 @pytest.mark.skip(
@@ -133,7 +165,7 @@ def test_chat(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 def test_model_engine(aphrodite_runner, model: str, dtype: str) -> None:
-    EXPECTED_ENGINE_LOGPROBS = load_logprobs(FIXTURE_LOGPROBS_ENGINE)
+    EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
     args = EngineArgs(
         model=model,
         tokenizer_mode="mistral",
@@ -162,7 +194,7 @@ def test_model_engine(aphrodite_runner, model: str, dtype: str) -> None:
             break
 
     logprobs = aphrodite_runner._final_steps_generate_w_logprobs(outputs)
-    check_logprobs_close(outputs_0_lst=logprobs,
-                         outputs_1_lst=EXPECTED_ENGINE_LOGPROBS,
-                         name_0="output",
-                         name_1="h100_ref")
+    check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
+                         outputs_1_lst=logprobs,
+                         name_0="h100_ref",
+                         name_1="output")

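_dump_outputs_w_logprobs is only needed when regenerating the golden files, so no call site appears in this diff. A quick round-trip sanity check for the pair of helpers (illustrative; it assumes Logprob is a plain dataclass whose first field is logprob, which is what the asdict/** reconstruction above relies on) could be:

sample = [([1, 2], "hi",
           [{1: Logprob(logprob=-0.1)}, {2: Logprob(logprob=-0.5)}])]
_dump_outputs_w_logprobs(sample, "/tmp/pixtral_chat_roundtrip.json")
assert load_outputs_w_logprobs("/tmp/pixtral_chat_roundtrip.json") == sample
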
+ 33 - 5
tests/models/test_qwen.py → tests/models/decoder_only/vision_language/test_qwen.py

@@ -10,11 +10,9 @@ from aphrodite.inputs import InputContext, LLMInputs
 from aphrodite.multimodal.base import MultiModalInputs
 from aphrodite.multimodal.utils import cached_get_tokenizer, rescale_image_size
 
-from ..conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner, ImageAsset,
-                        PromptImageInput, _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
+                          AphroditeRunner, _ImageAssets)
+from ...utils import check_logprobs_close
 
 text_only_models = [
     "Qwen/Qwen-7B-Chat"  # Has no visual component
@@ -42,6 +40,8 @@ IMG_PAD_ID = 151859
 TOKS_PER_IMG = 256
 VIS_ENC_DIM = 4096
 IMG_SIZE = 448
+
+
 def build_model_context(model_name: str,
                         tokenizer_name: Optional[str] = None,
                         trust_remote_code: bool = False):
@@ -51,6 +51,7 @@ def build_model_context(model_name: str,
         model_name: Name of the model being considered.
         tokenizer_name: Name of the tokenizer being considered.
         trust_remote_code: Whether or not to allow loading remote code.
+
     Returns:
         InputContext for the model being considered.
     """
@@ -65,21 +66,29 @@ def build_model_context(model_name: str,
         seed=0,
     )
     return InputContext(model_config)
+
+
 @pytest.fixture()
 def input_mapper_for_qwen():
     # Lazy import to avoid initializing CUDA during test collection
     from aphrodite.modeling.models.qwen import input_mapper_for_qwen
     return input_mapper_for_qwen
+
+
 @pytest.fixture()
 def input_processor_for_qwen():
     # Lazy import to avoid initializing CUDA during test collection
     from aphrodite.modeling.models.qwen import input_processor_for_qwen
     return input_processor_for_qwen
+
+
 @pytest.fixture()
 def qwen_vl_context() -> InputContext:
     """Get an InputContext for Qwen-VL."""
     return build_model_context(model_name="Qwen/Qwen-VL",
                                trust_remote_code=True)
+
+
 # Happy path tests for single/multi-image scenarios for the multimodal
 # input processor and mapper, respectively
 @pytest.mark.parametrize("num_images", [1, 2])
@@ -99,11 +108,14 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
     )
     proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
     assert isinstance(proc_inputs, dict)
+
     # Each image should have one start / stop and a fixed context of 256
     proc_tokens = proc_inputs["prompt_token_ids"]
     assert proc_tokens.count(IMG_START_ID) == num_images
     assert proc_tokens.count(IMG_END_ID) == num_images
     assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
+
+
 @pytest.mark.parametrize(
     "img_data,expected_shape",
     [
@@ -130,6 +142,8 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
     assert isinstance(mapped_img_data, MultiModalInputs)
     assert "pixel_values" in mapped_img_data
     assert mapped_img_data["pixel_values"].shape == expected_shape
+
+
 # Sad path tests for the multimodal input processor and mapper, respectively
 @pytest.mark.parametrize("mm_data", [
     {
@@ -153,6 +167,8 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
     # Should fail since we have too many or too few dimensions for embeddings
     with pytest.raises(ValueError):
         input_processor_for_qwen(qwen_vl_context, inputs)
+
+
 @pytest.mark.parametrize(
     "img_data",
     [
@@ -169,6 +185,8 @@ def test_input_mapper_invalid_mm_data(
     """Sad cases validated in Qwen VL's multimodal input mapper."""
     with pytest.raises(ValueError):
         input_mapper_for_qwen(qwen_vl_context, img_data)
+
+
 ### End-to-end generation tests
 def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
                          assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
@@ -176,6 +194,7 @@ def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
     tempdir & replace its contents with the local path to the string so that
     the HF version of Qwen-VL can resolve the path and load the image in its
     forward() call.
+
     Args:
         tmp_path: Tempdir for test under consideration.
         prompt: Prompt with image placeholders.
@@ -184,6 +203,7 @@ def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
     # Ensure that the number of placeholders matches the number of assets;
     # If this is not true, the test is probably written incorrectly.
     assert prompt.count("<img></img>") == len(assets)
+
     # Replace the placeholders with local paths to the exported assets
     for asset in assets:
         image_tmp_path = tmp_path / f"{asset.name}.jpg"
@@ -195,6 +215,7 @@ def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
         )
     return prompt
 
+
 def run_test(
     hf_runner: Type[HfRunner],
     aphrodite_runner: Type[AphroditeRunner],
@@ -209,6 +230,7 @@ def run_test(
     distributed_executor_backend: Optional[str] = None,
 ):
     """Inference result should be the same between hf and aphrodite.
+
     All the image fixtures for the test are under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects
@@ -286,14 +308,17 @@ def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                         num_logprobs: int) -> None:
     """Tests multimodal models with single image prompts."""
     images = [asset.pil_image for asset in image_assets]
+
     prompts = [
         get_prompt_with_path(tmp_path, prompt, [asset])
         for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
     ]
+
     inputs = [(
         [prompt for _ in size_factors],
         [rescale_image_size(image, factor) for factor in size_factors],
     ) for image, prompt in zip(images, prompts)]
+
     run_test(
         hf_runner,
         aphrodite_runner,
@@ -305,6 +330,8 @@ def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
         mm_limit=1,
         tensor_parallel_size=1,
     )
+
+
 @pytest.mark.parametrize("model", multimodal_models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -337,6 +364,7 @@ def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
     inputs = [([prompt for _ in size_factors],
                [[rescale_image_size(image, factor) for image in images]
                 for factor in size_factors])]
+
     run_test(
         hf_runner,
         aphrodite_runner,
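As a side note, the processor assertions earlier in this file encode a fixed per-image token budget. A minimal standalone sketch of that arithmetic (illustrative only; the helper name is hypothetical and not part of the test file):

TOKS_PER_IMG = 256  # fixed visual context per image, matching the constant above

def expected_image_token_counts(num_images: int) -> dict:
    # One IMG_START_ID and one IMG_END_ID per image, plus TOKS_PER_IMG
    # IMG_PAD_ID tokens for each image's visual context.
    return {
        "img_start": num_images,
        "img_end": num_images,
        "img_pad": num_images * TOKS_PER_IMG,
    }

assert expected_image_token_counts(2) == {"img_start": 2, "img_end": 2, "img_pad": 512}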

+ 0 - 0
tests/models/embedding/__init__.py


+ 0 - 0
tests/models/embedding/language/__init__.py


+ 1 - 1
tests/models/test_embedding.py → tests/models/embedding/language/test_embedding.py

@@ -1,4 +1,4 @@
-"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
+"""Compare the outputs of HF and Aphrodite for Mistral models using greedy sampling.
 
 Run `pytest tests/models/test_llama_embedding.py`.
 """

+ 0 - 0
tests/models/encoder_decoder/__init__.py


+ 0 - 0
tests/models/encoder_decoder/language/__init__.py


+ 102 - 53
tests/models/test_bart.py → tests/models/encoder_decoder/language/test_bart.py

@@ -1,9 +1,8 @@
-"""Compare the outputs of HF and Aphrodite for BART models using greedy
-sampling.
+"""Compare the outputs of HF and Aphrodite for BART models using greedy sampling.
 
-Run `pytest tests/models/test_bart.py`.
+Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
 """
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Type
 
 from aphrodite.common.utils import is_cpu
 
@@ -17,8 +16,10 @@ if not is_cpu():
 
     from aphrodite.common.sequence import SampleLogprobs
 
-    from ..conftest import DecoderPromptType
-    from .utils import check_logprobs_close
+    from ....conftest import (AphroditeRunner, DecoderPromptType,
+                              ExplicitEncoderDecoderPrompt, HfRunner)
+    from ....utils import multi_gpu_test
+    from ...utils import check_logprobs_close
 
     MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
 
@@ -35,24 +36,22 @@ if not is_cpu():
 
         return output_ids, hf_output_str, out_logprobs
 
-    @pytest.mark.parametrize("model", MODELS)
-    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
-    @pytest.mark.parametrize("max_tokens", [64])
-    @pytest.mark.parametrize("num_logprobs", [5])
-    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
-    def test_models(
-        hf_runner,
-        aphrodite_runner,
-        example_encoder_decoder_prompts,
+    def run_test(
+        hf_runner: Type[HfRunner],
+        aphrodite_runner: Type[AphroditeRunner],
+        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        decoder_prompt_type: DecoderPromptType,
         model: str,
+        *,
         dtype: str,
         max_tokens: int,
         num_logprobs: int,
-        decoder_prompt_type: DecoderPromptType,
+        tensor_parallel_size: int,
+        distributed_executor_backend: Optional[str] = None,
     ) -> None:
         '''
-        Test the Aphrodite BART model for a variety of encoder/decoder
-        input prompts, by validating it against HuggingFace (HF) BART.
+        Test the Aphrodite BART model for a variety of encoder/decoder input prompts,
+        by validating it against HuggingFace (HF) BART.
 
         Arguments:
 
@@ -88,23 +87,22 @@ if not is_cpu():
           then (4) after computing logits during prefill, override the model
           logits & force <BOS> to be the first generated token.
         
-        * Aphrodite will (1) tokenize the None prompt as [<BOS>], (2) append
-          decoder-start-token to the beginning, yielding
-          [<decoder-start-token><BOS>], (3) pass these tokens to the model &
-          proceed with generation.
+        * Aphrodite will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
+          start-token to the beginning, yielding [<decoder-start-token><BOS>],
+          (3) pass these tokens to the model & proceed with generation.
+        
+        The net effect is that compared to Aphrodite, the list of HF *decoded* tokens
+        will contain one more initial <BOS> than the Aphrodite generated tokens,
+        because Aphrodite's <BOS> token is injected into the prompt rather than into
+        the generated output. This is in spite of the fact that overall, the
+        complete sequences (prompt + decoded tokens) produced by Aphrodite will match
+        HF.
+        
+        So when we use HF decoded token output to validate Aphrodite's decoded token
+        output, the testing process must account for the difference in decoded
+        token sequences between Aphrodite and HF specifically in the
+        decoder-prompt-is-None case. 
         
-        The net effect is that compared to Aphrodite, the list of HF
-        *decoded* tokens will contain one more initial <BOS> than the
-        Aphrodite generated tokens, because Aphrodite's <BOS> token is
-        injected into the prompt rather than into the generated output.
-        This is in spite of the fact that overall, the complete sequences
-        (prompt + decoded tokens) produced by Aphrodite will match HF.
-
-        So when we use HF decoded token output to validate Aphrodite's decoded
-        token output, the testing process must account for the difference in
-        decoded token sequences between Aphrodite and HF specifically in the
-        decoder-prompt-is-None case.
-
         One option is to disable the logit processor feature that forces the
         <BOS> token to be decoded (forced_bos_token_id = None), eliminating
         the problem entirely. However this is not "normal" BART usage.
@@ -118,8 +116,29 @@ if not is_cpu():
         token during the process of validating the Aphrodite decoded output.
         '''
 
-        test_case_prompts = example_encoder_decoder_prompts[
-            decoder_prompt_type]
+        # NOTE: take care of the order. run Aphrodite first, and then run HF.
+        # Aphrodite needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method (the default).
+
+        # Note: currently encoder/decoder models are only compatible with
+        # enforce_eager=True. Normally this is not a problem because
+        # for encoder/decoder models Aphrodite will
+        # default to enforce_eager=True if enforce_eager
+        # is left unspecified. However, the
+        # AphroditeRunner test fixture (which wraps around the LLM class) defaults to
+        # enforce_eager=False (a behavior which a number of already-existing
+        # decoder-only unit tests expect), so when testing an encoder/decoder
+        # model we must explicitly specify enforce_eager=True in the AphroditeRunner
+        # constructor.
+        with aphrodite_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=tensor_parallel_size,
+                distributed_executor_backend=distributed_executor_backend,
+                enforce_eager=True) as aphrodite_model:
+            aphrodite_outputs = aphrodite_model.generate_encoder_decoder_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)
 
         # Configuration settings for HF baseline
         hf_kwargs = {
@@ -137,28 +156,12 @@ if not is_cpu():
                        auto_cls=AutoModelForSeq2SeqLM) as hf_model:
             hf_outputs = (
                 hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-                    test_case_prompts,
+                    prompts,
                     max_tokens,
                     num_logprobs,
                     **hf_kwargs,
                 ))
 
-        # Note: currently encoder/decoder models are only compatible with
-        # enforce_eager=True. Normally this is not a problem because
-        # for encoder/decoder models Aphrodite will
-        # default to enforce_eager=True if enforce_eager
-        # is left unspecified. However, the
-        # AphroditeRunner test fixture (which wraps around the LLM class)
-        # defaults to enforce_eager=False (a behavior which a number of
-        # already-exisitng decoder-only unit tests expect), so when testing
-        # an encoder/decoder model we must explicitly specify enforce_eager=True
-        # in the AphroditeRunner constructor.
-        with aphrodite_runner(model, dtype=dtype,
-                              enforce_eager=True) as aphrodite_model:
-            aphrodite_outputs = (
-                aphrodite_model.generate_encoder_decoder_greedy_logprobs(
-                    test_case_prompts, max_tokens, num_logprobs))
-
         hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                           else 0)
 
@@ -172,3 +175,49 @@ if not is_cpu():
             name_1="aphrodite",
             num_outputs_0_skip_tokens=hf_skip_tokens,
         )
+
+    @pytest.mark.parametrize("model", MODELS)
+    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+    def test_models(hf_runner, aphrodite_runner, example_encoder_decoder_prompts,
+                    model, dtype, max_tokens, num_logprobs,
+                    decoder_prompt_type) -> None:
+
+        run_test(
+            hf_runner,
+            aphrodite_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=1,
+        )
+
+    @multi_gpu_test(num_gpus=2)
+    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+    @pytest.mark.parametrize("dtype", ["float"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+    def test_models_distributed(hf_runner, aphrodite_runner,
+                                example_encoder_decoder_prompts,
+                                distributed_executor_backend, model, dtype,
+                                max_tokens, num_logprobs,
+                                decoder_prompt_type) -> None:
+        run_test(
+            hf_runner,
+            aphrodite_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+        )
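To make the <BOS> bookkeeping described in the docstring concrete, here is a minimal illustrative sketch of why one leading HF token is skipped in the decoder-prompt-is-None case (token ids are invented for the example):

BOS = 0
hf_decoded = [BOS, 101, 102, 103]    # HF forces <BOS> as the first *generated* token
aphrodite_decoded = [101, 102, 103]  # Aphrodite injects <BOS> into the prompt instead

hf_skip_tokens = 1                   # what the test passes as num_outputs_0_skip_tokens
assert hf_decoded[hf_skip_tokens:] == aphrodite_decoded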

Changes are not shown because the file is too large.
+ 0 - 0
tests/models/fixtures/pixtral_chat.json


BIN
tests/models/fixtures/pixtral_chat.pickle


Changes are not shown because the file is too large.
+ 0 - 0
tests/models/fixtures/pixtral_chat_engine.json


BIN
tests/models/fixtures/pixtral_chat_engine.pickle


+ 0 - 118
tests/models/test_fp8.py

@@ -1,118 +0,0 @@
-# flake8: noqa
-"""Tests fp8 models against ground truth generation
-Note: these tests will only pass on L4 GPU.
-"""
-import os
-from typing import List
-
-import pytest
-import torch
-from transformers import AutoTokenizer
-
-from aphrodite import LLM, SamplingParams
-from tests.quantization.utils import is_quant_method_supported
-
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-MAX_MODEL_LEN = 1024
-
-MODELS = [
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "meta-llama/Meta-Llama-3-8B-Instruct",
-]
-
-EXPECTED_STRS_MAP = {
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": {
-        "auto": [
-            'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both',
-            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-            'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no'
-        ],
-        "fp8": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system made up of several basic components that work together to enable it to',
-            'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk'
-        ]
-    },
-    "meta-llama/Meta-Llama-3-8B-Instruct": {
-        "auto": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-            'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
-        ],
-        "fp8": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-            'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu'
-        ]
-    },
-}
-
-
-# This test compares against golden strings for exact match since
-# there is no baseline implementation to compare against
-# and is unstable w.r.t specifics of the fp8 implementation or
-# the hardware being run on.
-# Disabled to prevent it from breaking the build
-@pytest.mark.skip(
-    reason=
-    "Prevent unstable test based on golden strings from breaking the build.")
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
-@pytest.mark.parametrize("model_name", MODELS)
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
-    model = LLM(model=model_name,
-                max_model_len=MAX_MODEL_LEN,
-                trust_remote_code=True,
-                enforce_eager=True,
-                quantization="fp8",
-                kv_cache_dtype=kv_cache_dtype)
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
-        for prompt in example_prompts
-    ]
-
-    params = SamplingParams(max_tokens=20, temperature=0)
-    generations: List[str] = []
-    # Note: these need to be run 1 at a time due to numerical precision,
-    # since the expected strs were generated this way.
-    for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
-        generations.append(outputs[0].outputs[0].text)
-    del model
-
-    print(model_name, kv_cache_dtype, generations)
-    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
-    for i in range(len(example_prompts)):
-        generated_str = generations[i]
-        expected_str = expected_strs[i]
-        assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nAphrodite: {generated_str!r}")

+ 5 - 0
tests/models/test_registry.py

@@ -1,9 +1,14 @@
 import pytest
+import transformers
 
 from aphrodite.modeling.models import _MODELS, ModelRegistry
 
 
 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
+    if (model_cls == "Qwen2VLForConditionalGeneration"
+            and transformers.__version__ < "4.45"):
+        pytest.skip("Waiting for next transformers release")
+
     # Ensure all model classes can be imported successfully
     ModelRegistry.resolve_model_cls([model_cls])

+ 4 - 1
tests/models/utils.py

@@ -67,7 +67,6 @@ TokensTextLogprobsPromptLogprobs = Tuple[
     Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
 
 
-
 def check_logprobs_close(
     *,
     outputs_0_lst: Sequence[Union[TokensTextLogprobs,
@@ -91,6 +90,7 @@ def check_logprobs_close(
     * `always_check_logprobs == False`: highest-logprob token ids are
       only compared at sampled token offsets for which generated token
       ids don't match
+
     Prompt logprobs must be provided either for both input sequences, or
     for neither. If prompt logprobs are provided, then highest-logprob
     prompt token ids must match between seq0 and seq1 at all prompt token
@@ -139,6 +139,7 @@ def check_logprobs_close(
                 logprobs_1,
                 prompt_logprobs_1,
             ) = outputs_1
+
             # Test prompt logprobs closeness
             if (prompt_logprobs_0 is not None
                     and prompt_logprobs_1 is not None):
@@ -151,6 +152,7 @@ def check_logprobs_close(
                         f"Prompt logprobs test:"
                         f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}"
                         f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}")
+
                     if logprobs_elem_0 is None:
                         # If the seq 0 token's logprobs are `None`,
                         # the seq 1 token's logprobs must be `None`
@@ -167,6 +169,7 @@ def check_logprobs_close(
                 fail_msg = (f"Prompt logprobs test:"
                             f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
                             f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}")
+
                 assert (prompt_logprobs_0 is None
                         and prompt_logprobs_1 is None), fail_msg
         else:
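The docstring added above states when prompt logprobs take part in the comparison. A hedged restatement of that invariant as a standalone predicate (hypothetical helper, not part of the module):

def prompt_logprobs_consistent(prompt_logprobs_0, prompt_logprobs_1) -> bool:
    # Either both sequences carry prompt logprobs, or neither does.
    if (prompt_logprobs_0 is None) != (prompt_logprobs_1 is None):
        return False
    if prompt_logprobs_0 is None:
        return True
    # Where one sequence has a None entry (e.g. the first prompt token),
    # the other must have None at the same position.
    return all((elem_0 is None) == (elem_1 is None)
               for elem_0, elem_1 in zip(prompt_logprobs_0, prompt_logprobs_1))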

+ 35 - 11
tests/utils.py

@@ -10,18 +10,20 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional
 
 import openai
+import pytest
 import requests
 from openai.types.completion import Completion
 from transformers import AutoTokenizer
 from typing_extensions import ParamSpec
 
-from aphrodite.common.utils import (FlexibleArgumentParser, get_open_port,
+from aphrodite.common.utils import (FlexibleArgumentParser,
+                                    cuda_device_count_stateless, get_open_port,
                                     is_hip)
 from aphrodite.distributed import (ensure_model_parallel_initialized,
                                    init_distributed_environment)
 from aphrodite.endpoints.openai.args import make_arg_parser
 from aphrodite.engine.args_tools import AsyncEngineArgs
-from aphrodite.modeling.model_loader.loader import DefaultModelLoader
+from aphrodite.modeling.model_loader.loader import get_model_loader
 from aphrodite.platforms import current_platform
 from tests.models.utils import TextTextLogprobs
 
@@ -90,11 +92,11 @@ class RemoteOpenAIServer:
         is_local = os.path.isdir(model)
         if not is_local:
             engine_args = AsyncEngineArgs.from_cli_args(args)
-            engine_config = engine_args.create_engine_config()
-            dummy_loader = DefaultModelLoader(engine_config.load_config)
-            dummy_loader._prepare_weights(engine_config.model_config.model,
-                                          engine_config.model_config.revision,
-                                          fall_back_to_pt=True)
+            model_config = engine_args.create_model_config()
+            load_config = engine_args.create_load_config()
+
+            model_loader = get_model_loader(load_config)
+            model_loader.download_model(model_config)
 
         env = os.environ.copy()
         # the current process might initialize cuda,
@@ -103,7 +105,7 @@ class RemoteOpenAIServer:
         if env_dict is not None:
             env.update(env_dict)
         self.proc = subprocess.Popen(
-            ["aphrodite", "run", model, *aphrodite_serve_args],
+            ["aphrodite", "serve", model, *aphrodite_serve_args],
             env=env,
             stdout=sys.stdout,
             stderr=sys.stderr,
@@ -118,7 +120,7 @@ class RemoteOpenAIServer:
     def __exit__(self, exc_type, exc_value, traceback):
         self.proc.terminate()
         try:
-            self.proc.wait(3)
+            self.proc.wait(8)
         except subprocess.TimeoutExpired:
             # force kill if needed
             self.proc.kill()
@@ -179,7 +181,12 @@ def compare_two_settings(model: str,
         env2: The second set of environment variables to pass to the API server.
     """
 
-    tokenizer = AutoTokenizer.from_pretrained(model)
+    trust_remote_code = "--trust-remote-code"
+    if trust_remote_code in arg1 or trust_remote_code in arg2:
+        tokenizer = AutoTokenizer.from_pretrained(model,
+                                                  trust_remote_code=True)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(model)
 
     prompt = "Hello, my name is"
     token_ids = tokenizer(prompt)["input_ids"]
@@ -356,6 +363,7 @@ def get_physical_device_indices(devices):
     visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
     if visible_devices is None:
         return devices
+
     visible_indices = [int(x) for x in visible_devices.split(",")]
     index_mapping = {i: physical for i, physical in enumerate(visible_indices)}
     return [index_mapping[i] for i in devices if i in index_mapping]
@@ -446,6 +454,22 @@ def fork_new_process_for_each_test(
     return wrapper
 
 
+def multi_gpu_test(*, num_gpus: int):
+    """
+    Decorate a test to be run only when multiple GPUs are available.
+    """
+    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
+    test_skipif = pytest.mark.skipif(
+        cuda_device_count_stateless() < num_gpus,
+        reason=f"Need at least {num_gpus} GPUs to run the test.",
+    )
+
+    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
+        return test_selector(test_skipif(fork_new_process_for_each_test(f)))
+
+    return wrapper
+
+
 async def completions_with_server_args(
     prompts: List[str],
     model_name: str,
@@ -469,7 +493,7 @@ async def completions_with_server_args(
     '''
 
     outputs = None
-    max_wait_seconds = 240 * 3  # 240 is the default
+    max_wait_seconds = 240 * 3  # 240 is default
     with RemoteOpenAIServer(model_name,
                             server_cli_args,
                             max_wait_seconds=max_wait_seconds) as server:
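Finally, a minimal usage sketch for the new multi_gpu_test helper (the test name and body here are hypothetical; the BART distributed test above is a real in-tree example):

from tests.utils import multi_gpu_test

@multi_gpu_test(num_gpus=2)
def test_something_distributed():
    # Marked distributed_2_gpus, skipped when fewer than 2 GPUs are visible,
    # and wrapped in fork_new_process_for_each_test so it runs in a fresh process.
    ...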

Too many files were changed in this commit, so some files are not shown.