"""
This example shows how to use the multi-LoRA functionality for offline
inference. Requires HuggingFace credentials for access to Llama 2.
"""
import asyncio
from typing import List, Optional, Tuple

from aphrodite import AsyncAphrodite, AsyncEngineArgs, SamplingParams
from aphrodite.lora.request import LoRARequest
def create_test_prompts(
    lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Create a list of test prompts with their sampling parameters.

    2 requests for the base model, 4 requests for the LoRA. We define 2
    different LoRA adapters (using the same model for demo purposes).
    Since we also set `max_loras=1`, the expectation is that the requests
    with the second LoRA adapter will be run after all requests with the
    first adapter have finished.

    Args:
        lora_path: Path (or HF repo id) of the LoRA adapter weights; both
            adapters below point at the same weights for demo purposes.

    Returns:
        A list of (prompt, sampling params, optional LoRA request) tuples;
        ``None`` as the third element means "use the base model".
    """
    # One request object per adapter, reused below instead of repeating
    # the identical name/id/path triple for every prompt.
    first_adapter = LoRARequest(
        lora_name="l2-lora-test",
        lora_int_id=1,
        lora_path=lora_path,
    )
    second_adapter = LoRARequest(
        lora_name="l2-lora-test2",
        lora_int_id=2,
        lora_path=lora_path,
    )
    return [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0,
                        prompt_logprobs=1,
                        max_tokens=128),
         None),
        ("To be or not to be,",
         SamplingParams(temperature=0.8,
                        top_k=5,
                        presence_penalty=0.2,
                        max_tokens=128),
         None),
        ("""[user] Write a SQL query to answer the question based on the
table schema.\n\n context: CREATE TABLE table_name_74
(icao VARCHAR, airport VARCHAR)\n\n
question: Name the ICAO for lilongwe
international airport [/user] [assistant]""",
         SamplingParams(temperature=0.0,
                        prompt_logprobs=1,
                        max_tokens=128,
                        stop_token_ids=[32003]),
         first_adapter),
        ("""[user] Write a SQL query to answer the question based on the table
schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR,
elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector
what is under nationality? [/user] [assistant]""",
         SamplingParams(n=3,
                        best_of=3,
                        temperature=0.8,
                        max_tokens=128,
                        stop_token_ids=[32003]),
         first_adapter),
        ("""[user] Write a SQL query to answer the question based on the
table schema.\n\n context: CREATE TABLE table_name_74 (icao
VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe
international airport [/user] [assistant]""",
         SamplingParams(temperature=0.0,
                        prompt_logprobs=1,
                        max_tokens=128,
                        stop_token_ids=[32003]),
         second_adapter),
        ("""[user] Write a SQL query to answer the question based on the table
schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR,
elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector
what is under nationality? [/user] [assistant]""",
         SamplingParams(n=3,
                        best_of=3,
                        temperature=0.9,
                        max_tokens=128,
                        stop_token_ids=[32003]),
         first_adapter),
    ]
async def process_requests(
        engine: "AsyncAphrodite",
        test_prompts: "List[Tuple[str, SamplingParams, Optional[LoRARequest]]]"):
    """Submit all prompts to the engine and print each finished output.

    All requests are submitted up front and their output streams are
    drained concurrently. Consuming the generators one after another (as
    a naive loop would) serializes the requests — with an async-generator
    `generate`, request N+1 would not even start until request N finished,
    which defeats the multi-LoRA scheduling this example demonstrates.

    Args:
        engine: The async engine to submit requests to.
        test_prompts: (prompt, sampling params, optional LoRA request)
            tuples, e.g. from `create_test_prompts`.
    """

    async def _drain(request_generator):
        # Consume one request's output stream; only report the final,
        # finished output (intermediate partial outputs are skipped).
        async for request_output in request_generator:
            if request_output.finished:
                print(request_output)

    tasks = []
    for request_id, (prompt, sampling_params,
                     lora_request) in enumerate(test_prompts):
        request_generator = engine.generate(
            prompt,
            sampling_params,
            str(request_id),  # engine request ids are strings
            lora_request=lora_request,
        )
        # Start draining immediately so the engine sees every request now.
        tasks.append(asyncio.create_task(_drain(request_generator)))
    # Wait for every request to finish; gather propagates any errors.
    await asyncio.gather(*tasks)
def initialize_engine() -> AsyncAphrodite:
    """Build and return the AsyncAphrodite engine for this example.

    LoRA support is switched on via ``enable_lora``; the ``max_loras`` /
    ``max_lora_rank`` / ``max_cpu_loras`` knobs bound how many adapters
    are resident on GPU and CPU at once.
    """
    args = AsyncEngineArgs(
        model="NousResearch/Llama-2-7b-hf",
        enable_lora=True,
        max_loras=1,
        max_lora_rank=8,
        max_cpu_loras=2,
        max_num_seqs=256,
    )
    return AsyncAphrodite.from_engine_args(args)
async def main():
    """Set up the engine, build the demo prompts, and process them."""
    lora_adapter_path = "alpindale/l2-lora-test"
    engine = initialize_engine()
    await process_requests(engine, create_test_prompts(lora_adapter_path))
if __name__ == "__main__":
    # Script entry point: run the async example on a fresh event loop.
    asyncio.run(main())