lora_async_aphrodite.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. """
  2. This example shows how to use the multi-LoRA functionality for offline
  3. inference. Requires HuggingFace credentials for access to Llama2.
  4. """
  5. import asyncio
  6. from typing import List, Optional, Tuple
  7. from aphrodite import AsyncAphrodite, AsyncEngineArgs, SamplingParams
  8. from aphrodite.lora.request import LoRARequest
  9. def create_test_prompts(
  10. lora_path: str
  11. ) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
  12. """Create a list of test prompts with their sampling parameters.
  13. 2 requests for base model, 4 requests for the LoRA. We define 2
  14. different LoRA adapters (using the same model for demo purposes).
  15. Since we also set `max_loras=1`, the expectation is that the requests
  16. with the second LoRA adapter will be ran after all requests with the
  17. first adapter have finished.
  18. """
  19. return [
  20. (
  21. "A robot may not injure a human being",
  22. SamplingParams(
  23. temperature=0.0,
  24. # logprobs=1,
  25. prompt_logprobs=1,
  26. max_tokens=128),
  27. None),
  28. ("To be or not to be,",
  29. SamplingParams(temperature=0.8,
  30. top_k=5,
  31. presence_penalty=0.2,
  32. max_tokens=128), None),
  33. (
  34. """[user] Write a SQL query to answer the question based on the
  35. table schema.\n\n context: CREATE TABLE table_name_74
  36. (icao VARCHAR, airport VARCHAR)\n\n
  37. question: Name the ICAO for lilongwe
  38. international airport [/user] [assistant]""",
  39. SamplingParams(
  40. temperature=0.0,
  41. # logprobs=1,
  42. prompt_logprobs=1,
  43. max_tokens=128,
  44. stop_token_ids=[32003]),
  45. LoRARequest(
  46. lora_name="l2-lora-test",
  47. lora_int_id=1,
  48. lora_path=lora_path
  49. )),
  50. ("""[user] Write a SQL query to answer the question based on the table
  51. schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR,
  52. elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector
  53. what is under nationality? [/user] [assistant]""",
  54. SamplingParams(n=3,
  55. best_of=3,
  56. temperature=0.8,
  57. max_tokens=128,
  58. stop_token_ids=[32003]),
  59. LoRARequest(
  60. lora_name="l2-lora-test",
  61. lora_int_id=1,
  62. lora_path=lora_path
  63. )),
  64. (
  65. """[user] Write a SQL query to answer the question based on the
  66. table schema.\n\n context: CREATE TABLE table_name_74 (icao
  67. VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe
  68. international airport [/user] [assistant]""",
  69. SamplingParams(
  70. temperature=0.0,
  71. # logprobs=1,
  72. prompt_logprobs=1,
  73. max_tokens=128,
  74. stop_token_ids=[32003]),
  75. LoRARequest(
  76. lora_name="l2-lora-test2",
  77. lora_int_id=2,
  78. lora_path=lora_path
  79. )),
  80. ("""[user] Write a SQL query to answer the question based on the table
  81. schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR,
  82. elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector
  83. what is under nationality? [/user] [assistant]""",
  84. SamplingParams(n=3,
  85. best_of=3,
  86. temperature=0.9,
  87. max_tokens=128,
  88. stop_token_ids=[32003]),
  89. LoRARequest(
  90. lora_name="l2-lora-test",
  91. lora_int_id=1,
  92. lora_path=lora_path
  93. )),
  94. ] # type: ignore
  95. async def process_requests(engine: AsyncAphrodite,
  96. test_prompts: List[Tuple[str, SamplingParams,
  97. Optional[LoRARequest]]]):
  98. """Continuously process a list of prompts and handle the outputs."""
  99. request_id = 0
  100. active_requests = []
  101. for prompt, sampling_params, lora_request in test_prompts:
  102. request_generator = engine.generate(
  103. prompt,
  104. sampling_params,
  105. str(request_id),
  106. lora_request=lora_request
  107. )
  108. active_requests.append(request_generator)
  109. request_id += 1
  110. # Process all requests
  111. for request_generator in active_requests:
  112. # Don't await the generator itself, just iterate over it
  113. async for request_output in request_generator:
  114. if request_output.finished:
  115. print(request_output)
  116. def initialize_engine() -> AsyncAphrodite:
  117. """Initialize the AsyncAphrodite."""
  118. # Function remains unchanged as it's just initialization
  119. engine_args = AsyncEngineArgs(model="NousResearch/Llama-2-7b-hf",
  120. enable_lora=True,
  121. max_loras=1,
  122. max_lora_rank=8,
  123. max_cpu_loras=2,
  124. max_num_seqs=256)
  125. return AsyncAphrodite.from_engine_args(engine_args)
  126. async def main():
  127. """Main function that sets up and runs the prompt processing."""
  128. engine = initialize_engine()
  129. test_prompts = create_test_prompts("alpindale/l2-lora-test")
  130. await process_requests(engine, test_prompts)
# Script entry point: run the async driver to completion.
if __name__ == '__main__':
    asyncio.run(main())