"""Tests which cover integration of the speculative decoding framework with
tensor parallelism.
"""

import pytest
import torch

from aphrodite.common.utils import is_hip

from .conftest import run_greedy_equality_correctness_test

@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
# NOTE(review): the argument name "common_llm_kwargs" is reconstructed from the
# naming of the sibling parametrizations below; confirm against conftest.
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": "JackFram/llama-68m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,
        "tensor_parallel_size": 2,

        # Use AsyncLLM engine, so that the engine runs in its own process.
        # Otherwise, since aphrodite does not follow true SPMD, the test runner
        # process will have both the engine and the rank0 worker. NCCL is not
        # cleaned up properly, and its server host thread leaks, causing the
        # second run of the test to fail with internal NCCL error.
        "use_async": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        # Speculate with a separate draft model.
        {
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 3,
        },
        # Speculate with n-gram prompt lookup.
        {
            "speculative_model": "[ngram]",
            "num_speculative_tokens": 5,
            "ngram_prompt_lookup_max": 3,
        },
    ])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        # NOTE(review): the concrete value was missing from the file; 32 is
        # assumed -- confirm.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
                              batch_size: int, output_len: int):
    """Verify greedy equality when tensor parallelism is used.

    The test is parametrized with ``tensor_parallel_size=2`` and two
    speculative configurations (a draft model and ``[ngram]`` lookup).
    It is skipped when fewer than two CUDA devices are visible, and on HIP.

    Args:
        baseline_llm_generator: Fixture supplying the baseline LLM
            (presumably without speculation; provided by conftest).
        test_llm_generator: Fixture supplying the LLM built with the
            ``test_llm_kwargs`` speculative settings.
        batch_size: Number of prompts per batch.
        output_len: Number of tokens to generate per prompt.
    """
    if is_hip():
        pytest.skip("hip is not well-supported yet")

    # NOTE(review): call signature assumed from the helper's use as a
    # greedy-equality checker; confirm the keyword names against conftest.
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)

@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,
        "tensor_parallel_size": 2,

        # Use AsyncLLM engine, so that the engine runs in its own process.
        # Otherwise, since aphrodite does not follow true SPMD, the test runner
        # process will have both the engine and the rank0 worker. NCCL is not
        # cleaned up properly, and its server host thread leaks, causing the
        # second run of the test to fail with internal NCCL error.
        "use_async": True,

        # precision
        "dtype": "float32",
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
    "per_test_common_llm_kwargs, test_llm_kwargs",
                # Use a small model for a fast test.
                # Note this is repeated in the test body; to initialize a
                # tokenizer.
                "model": "JackFram/llama-68m",
                "speculative_model": "JackFram/llama-68m",
                "num_speculative_tokens": 5,
                "speculative_draft_tensor_parallel_size": 1,
            "model": "ibm-granite/granite-3b-code-instruct",
        }, {
            "num_speculative_tokens": 5,
            "speculative_draft_tensor_parallel_size": 1,
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
                                            batch_size: int):
    """Verify spec decode works well with smaller tp for draft models.