neuron.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. """Utilities for selecting and loading neuron models."""
  2. import importlib
  3. import os
  4. from typing import Dict, Optional, Tuple
  5. import torch
  6. import torch.nn as nn
  7. import transformers
  8. from transformers import PretrainedConfig
  9. from aphrodite.common.config import (ModelConfig, ParallelConfig,
  10. SchedulerConfig)
  11. from aphrodite.common.sequence import SamplerOutput
  12. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  13. from aphrodite.modeling.layers.sampler import Sampler
  14. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  15. TORCH_DTYPE_TO_NEURON_AMP = {
  16. "auto": "f32",
  17. "half": "f16",
  18. "float16": "f16",
  19. "bfloat16": "bf16",
  20. "float": "f32",
  21. "float32": "f32",
  22. torch.float16: "f16",
  23. torch.bfloat16: "bf16",
  24. torch.float32: "f32",
  25. }
  26. # Models supported by Neuron.
  27. _NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = {
  28. "LlamaForCausalLM": ("transformers_neuronx.llama.model",
  29. "LlamaForSampling", "LlamaForCausalLM"),
  30. "MistralForCausalLM": ("transformers_neuronx.mistral.model",
  31. "MistralForSampling", "MistralForCausalLM")
  32. }
  33. class NeuronCasualLM(nn.Module):
  34. def __init__(
  35. self,
  36. config: PretrainedConfig,
  37. ) -> None:
  38. super().__init__()
  39. self.config = config
  40. self.logits_processor = LogitsProcessor(config.vocab_size,
  41. logits_as_input=True)
  42. self.sampler = Sampler()
  43. # Lazy initialized
  44. self.model: nn.Module
  45. def forward(
  46. self,
  47. input_ids: torch.Tensor,
  48. positions: torch.Tensor,
  49. input_block_ids: torch.Tensor,
  50. ) -> torch.Tensor:
  51. logits = self.model(input_ids,
  52. cache_ids=positions,
  53. start_ids=input_block_ids)
  54. return logits
  55. def compute_logits(
  56. self,
  57. hidden_states: torch.Tensor,
  58. sampling_metadata: SamplingMetadata,
  59. ) -> Optional[torch.Tensor]:
  60. logits = self.logits_processor(None, hidden_states, sampling_metadata)
  61. return logits
  62. def sample(
  63. self,
  64. logits: torch.Tensor,
  65. sampling_metadata: SamplingMetadata,
  66. ) -> Optional[SamplerOutput]:
  67. next_tokens = self.sampler(logits, sampling_metadata)
  68. return next_tokens
  69. def load_weights(self, model_name_or_path: str, **kwargs):
  70. arch = _get_model_architecture(self.config)
  71. neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = (
  72. _NEURON_SUPPORTED_MODELS[arch])
  73. neuronx_module = importlib.import_module(neuronx_module_path)
  74. neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
  75. split_model_dir = f"{model_name_or_path}-split"
  76. if os.path.isdir(os.path.join(model_name_or_path,
  77. "pytorch_model.bin")):
  78. split_model_dir = model_name_or_path
  79. elif not os.path.exists(f"{model_name_or_path}-split"):
  80. hf_model_cls = getattr(transformers, hf_model_cls_name)
  81. from transformers_neuronx.module import save_pretrained_split
  82. hf_model = hf_model_cls.from_pretrained(model_name_or_path,
  83. low_cpu_mem_usage=True)
  84. save_pretrained_split(hf_model, f"{model_name_or_path}-split")
  85. self.model = neuronx_model_cls.from_pretrained(split_model_dir,
  86. **kwargs)
  87. self.model.to_neuron()
  88. def _get_model_architecture(config: PretrainedConfig) -> str:
  89. architectures = getattr(config, "architectures", [])
  90. for arch in architectures:
  91. if arch in _NEURON_SUPPORTED_MODELS:
  92. return arch
  93. raise ValueError(
  94. f"Model architectures {architectures} are not supported on Neuron "
  95. f"for now. Supported architectures: "
  96. f"{list(_NEURON_SUPPORTED_MODELS.keys())}")
  97. def get_neuron_model(model_config: ModelConfig,
  98. parallel_config: ParallelConfig,
  99. scheduler_config: SchedulerConfig) -> nn.Module:
  100. from transformers_neuronx.config import (ContinuousBatchingConfig,
  101. NeuronConfig)
  102. # Create a model instance.
  103. model = NeuronCasualLM(model_config.hf_config)
  104. continuous_batching_config = ContinuousBatchingConfig(
  105. batch_size_for_shared_caches=scheduler_config.max_num_seqs)
  106. neuron_config = NeuronConfig(
  107. continuous_batching=continuous_batching_config)
  108. # Load the weights from the cached or downloaded files.
  109. model.load_weights(
  110. model_config.model,
  111. tp_degree=parallel_config.tensor_parallel_size,
  112. amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
  113. neuron_config=neuron_config,
  114. context_length_estimate=[scheduler_config.max_model_len],
  115. n_positions=[scheduler_config.max_model_len],
  116. batch_size=scheduler_config.max_num_seqs)
  117. return model.eval()