""" This example shows how to use Aphrodite for running offline inference with the correct prompt format on vision language models. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ import os import cv2 import numpy as np from PIL import Image from transformers import AutoTokenizer from aphrodite import LLM, SamplingParams from aphrodite.assets.video import VideoAsset from aphrodite.common.utils import FlexibleArgumentParser from aphrodite.multimodal.utils import sample_frames_from_video # Input image and question image_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "burg.jpg") image = Image.open(image_path).convert("RGB") img_question = "What is the content of this image?" # Input video and question video_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "nadeko.mp4") vid_question = "What's in this video?" def load_video_frames(video_path: str, num_frames: int) -> np.ndarray: """ Load video frames from a local file path. Args: video_path: Path to the video file num_frames: Number of frames to sample from the video Returns: np.ndarray: Array of sampled video frames """ cap = cv2.VideoCapture(video_path) if not cap.isOpened(): raise ValueError(f"Could not open video file {video_path}") frames = [] while True: ret, frame = cap.read() if not ret: break frames.append(frame) cap.release() frames = np.stack(frames) return sample_frames_from_video(frames, num_frames) # LLaVA-1.5 def run_llava(question): prompt = f"USER: \n{question}\nASSISTANT:" llm = LLM(model="llava-hf/llava-1.5-7b-hf") stop_token_ids = None return llm, prompt, stop_token_ids # LLaVA-1.6/LLaVA-NeXT def run_llava_next(question): prompt = f"[INST] \n{question} [/INST]" llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) stop_token_ids = None return llm, prompt, stop_token_ids # LlaVA-NeXT-Video # Currently only support for video input def run_llava_next_video(question): prompt = f"USER: