# ruff: noqa: SIM117
import os
from pathlib import Path
from typing import List, Optional, Tuple

import openvino as ov
import torch
from huggingface_hub import HfApi
from loguru import logger
from openvino._offline_transformations import paged_attention_transformation
from optimum.intel import OVModelForCausalLM
from torch import nn

from aphrodite.attention.backends.openvino import OpenVINOAttentionMetadata
from aphrodite.common.config import DeviceConfig, ModelConfig
from aphrodite.common.sequence import SamplerOutput
from aphrodite.modeling.layers.logits_processor import (LogitsProcessor,
                                                        _prune_hidden_states)
from aphrodite.modeling.layers.sampler import Sampler
from aphrodite.modeling.sampling_metadata import SamplingMetadata
# NOTE: `os.getenv` returns a string, so `bool(os.getenv(...))` would be True
# for any non-empty value, including "0". Parse the flag explicitly instead.
APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = os.getenv(
    "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower() in (
        "1", "true", "yes")
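
# Example (shell): enable quantized weight loading before launching the
# engine. The exact entry point is a placeholder; any process that imports
# this module after setting the variable behaves the same way:
#
#   APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1 python -m <engine entry point>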


def _flattenize_inputs(inputs):
    """Recursively flatten nested lists, tuples, and dicts of inputs,
    dropping any `None` entries."""
    flatten_inputs = []
    for input_data in inputs:
        if input_data is None:
            continue
        if isinstance(input_data, (list, tuple)):
            flatten_inputs.extend(_flattenize_inputs(input_data))
        elif isinstance(input_data, dict):
            flatten_inputs.extend(
                _flattenize_inputs(list(input_data.values())))
        else:
            flatten_inputs.append(input_data)
    return flatten_inputs
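
# Example (hypothetical tensors): `forward` below stores the KV cache as a
# list of per-layer (key, value) pairs; this helper flattens it into the
# flat positional input list the compiled OpenVINO model expects:
#
#   kv_caches = [(k0, v0), (k1, v1)]
#   _flattenize_inputs(kv_caches)  # -> [k0, v0, k1, v1]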


def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
                             is_cpu: bool):
    # Apply hardware-dependent modifications to the KV cache tensors.
    for parameter in model.get_parameters():
        input_tensor = parameter.get_output_tensor(0)
        input_names = input_tensor.get_names()
        if len(input_names) != 1:
            continue
        input_name = next(iter(input_names))
        shape = parameter.get_partial_shape()
        # TODO: use the real block size if available; for now these are
        # placeholders that only provide the expected rank.
        x_size = 1
        num_blocks = ov.Dimension()
        block_size = ov.Dimension()
        head_size = ov.Dimension()
        # TODO: Negotiate the required layout with plugins (CPU is ~OK, GPU
        # is TBD) and pass more parameters to this function to set more
        # static dimensions.
        if input_name.startswith("key_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [
                num_blocks,
                shape[1],
                shape[2].get_length() //
                x_size if shape[2].is_static else ov.Dimension(),
                block_size,
                x_size,
            ]
        elif input_name.startswith("value_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [num_blocks, shape[1], shape[2], block_size]
        else:
            continue
        parameter.set_partial_shape(
            ov.PartialShape(cpu_shape if is_cpu else gpu_shape))
        parameter.set_element_type(kv_cache_dtype)
    model.validate_nodes_and_infer_types()
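
# Illustrative effect (CPU path): after `paged_attention_transformation`, a
# parameter named e.g. "key_cache.0" has its partial shape replaced with
# [num_blocks (dynamic), <kept dim 1, presumably the KV-head count>,
#  block_size (dynamic), head_size (dynamic)] and its element type set to
# `kv_cache_dtype`, so the plugin can bind paged KV blocks at runtime.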


def _require_model_export(model_id, revision=None, subfolder=None):
    model_dir = Path(model_id)
    if subfolder is not None:
        model_dir = model_dir / subfolder
    if model_dir.is_dir():
        return (not (model_dir / "openvino_model.xml").exists()
                or not (model_dir / "openvino_model.bin").exists())

    hf_api = HfApi()
    try:
        model_info = hf_api.model_info(model_id, revision=revision or "main")
        normalized_subfolder = (None if subfolder is None else
                                Path(subfolder).as_posix())
        model_files = [
            file.rfilename for file in model_info.siblings
            if normalized_subfolder is None
            or file.rfilename.startswith(normalized_subfolder)
        ]
        ov_model_path = ("openvino_model.xml" if normalized_subfolder is None
                         else f"{normalized_subfolder}/openvino_model.xml")
        return (ov_model_path not in model_files
                or ov_model_path.replace(".xml", ".bin") not in model_files)
    except Exception:
        return True
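
# Example (illustrative model ids): the check handles both local directories
# and Hugging Face Hub repos.
#
#   _require_model_export("/models/llama-ov")  # False if openvino_model.xml
#                                              # and .bin are already on disk
#   _require_model_export("meta-llama/Llama-2-7b-hf")  # True if the Hub repo
#                                                      # publishes no IR files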


class OpenVINOCausalLM(nn.Module):

    def __init__(
        self,
        model_config: ModelConfig,
        device_config: DeviceConfig,
        kv_cache_dtype: ov.Type,
    ) -> None:
        super().__init__()
        self.logits_processor = LogitsProcessor(
            model_config.hf_config.vocab_size, logits_as_input=True)
        self.sampler = Sampler()

        export = _require_model_export(model_config.model)
        if export:
            logger.warning(
                f"Provided model id {model_config.model} does not "  # noqa: G004
                "contain OpenVINO IR; the model will be converted to IR with "
                "default options. If you need specific options for the "
                "conversion, use `optimum-cli export openvino` with the "
                "desired options.")
        else:
            logger.warning(
                "OpenVINO IR is available for the provided model id "  # noqa: G004
                f"{model_config.model}. This IR will be used for inference "
                "as-is; all options that may affect model conversion are "
                "ignored.")

        load_in_8bit = APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
        pt_model = OVModelForCausalLM.from_pretrained(
            model_config.model,
            export=export,
            compile=False,
            load_in_8bit=load_in_8bit,
            trust_remote_code=model_config.trust_remote_code,
        )
        paged_attention_transformation(pt_model.model)
        _modify_cache_parameters(pt_model.model, kv_cache_dtype,
                                 device_config.device.type == "cpu")

        core = ov.Core()
        ov_compiled = core.compile_model(pt_model.model, "CPU")
        self.ov_request = ov_compiled.create_infer_request()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
        attn_metadata: OpenVINOAttentionMetadata,
    ) -> torch.Tensor:
        flatten_kv_cache = _flattenize_inputs(kv_caches)

        inputs = [
            input_ids,
            positions,
            *flatten_kv_cache,
            attn_metadata.past_lens,
            attn_metadata.subsequence_begins,
            attn_metadata.block_indices,
            attn_metadata.block_indices_begins,
            attn_metadata.max_context_len,
        ]

        self.ov_request.start_async(inputs, share_inputs=True)
        self.ov_request.wait()

        logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)

        # TODO: remove 'view' once OpenVINO PA drops the 'seq_len' dimension.
        return logits.view(-1, logits.shape[-1])
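
    # Note: `inputs` above is passed positionally, so its order must match
    # the transformed model's parameter order: token ids and positions first,
    # then the flattened per-layer (key, value) cache tensors, then the
    # paged-attention metadata tensors.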

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
        logits = self.logits_processor(None, hidden_states, sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens


def get_model(
    model_config: ModelConfig,
    device_config: DeviceConfig,
    kv_cache_dtype: ov.Type,
    **kwargs,
) -> torch.nn.Module:
    lora_config = kwargs.get("lora_config", None)
    if lora_config:
        raise ValueError(
            "OpenVINO modeling does not support LoRA, "
            "but LoRA is enabled. Support for this model may "
            "be added in the future. If this is important to you, "
            "please open an issue on GitHub.")
    return OpenVINOCausalLM(model_config, device_config, kv_cache_dtype)
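

# Example (hypothetical configs): a minimal sketch of driving this module.
# How `ModelConfig`/`DeviceConfig` are built is simplified here; the engine
# normally constructs them from its own arguments.
#
#   model_config = ModelConfig(model="meta-llama/Llama-2-7b-hf", ...)
#   device_config = DeviceConfig(device="cpu")
#   model = get_model(model_config, device_config, kv_cache_dtype=ov.Type.f32)
#   # model.forward(...) then runs one paged-attention inference step.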