neuron_inference.py

from aphrodite import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Once upon a time,",
    "In a galaxy far, far away,",
    "The quick brown fox jumps over the lazy dog.",
    "The meaning of life is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # The max_model_len and block_size arguments are required to be the same
    # as the max sequence length when targeting the Neuron device.
    # Currently, this is a known limitation of continuous batching support
    # in transformers-neuronx.
    # TODO: Support paged attention in transformers-neuronx.
    max_model_len=128,
    block_size=128,
    # The device can be automatically detected when the AWS Neuron SDK is
    # installed. The device argument can either be left unspecified for
    # automatic detection, or assigned explicitly.
    device="neuron",
    tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
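
For reproducible runs it can be useful to swap the sampled decoding above for greedy decoding. The snippet below is a minimal sketch of that variant; it assumes Aphrodite follows vLLM's SamplingParams semantics, where temperature=0 selects the highest-probability token at each step and max_tokens caps the completion length.

# Greedy decoding variant (a sketch; assumes Aphrodite mirrors vLLM's
# SamplingParams semantics: temperature=0 picks the argmax token at each
# step, and max_tokens caps the number of generated tokens).
greedy_params = SamplingParams(temperature=0.0, max_tokens=64)
greedy_outputs = llm.generate(prompts, greedy_params)
for output in greedy_outputs:
    print(f"Prompt: {output.prompt!r}, "
          f"Generated text: {output.outputs[0].text!r}")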