neuron_executor.py

from typing import Dict, List, Optional

from aphrodite.lora.request import LoRARequest
from aphrodite.common.config import (CacheConfig, DeviceConfig, ModelConfig,
                                     ParallelConfig, SchedulerConfig,
                                     LoRAConfig)
from aphrodite.executor.executor_base import ExecutorBase
from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata


class NeuronExecutor(ExecutorBase):
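    """Executor backend that runs the model on AWS Neuron devices.

    A single in-process NeuronWorker acts as the driver. Distributed
    execution, LoRA, and PagedAttention cache operations are not supported.
    """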

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        lora_config: Optional[LoRAConfig],
    ) -> None:
        self.model_config = model_config
        self.cache_config = cache_config
        assert lora_config is None, "LoRA is not supported for Neuron backend."
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config

        # Set the number of GPU blocks to be the same as the maximum number of
        # sequences that can be processed in a single batch. This is
        # equivalent to scheduling without PagedAttention.
        self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs
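        # No CPU (swap) blocks are allocated: without PagedAttention there is
        # no block swapping, and execute_model rejects all cache operations.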
        self.cache_config.num_cpu_blocks = 0

        # Instantiate the worker and load the model to the device.
        self._init_worker()

    def _init_worker(self):
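        # Deferred import: Neuron-specific dependencies are only pulled in
        # when this executor is actually instantiated.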
        from aphrodite.task_handler.neuron_worker import NeuronWorker

        self.driver_worker = NeuronWorker(
            self.model_config,
            self.parallel_config,
            self.scheduler_config,
            self.device_config,
        )
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def execute_model(self,
                      seq_group_metadata_list: List[SequenceGroupMetadata],
                      blocks_to_swap_in: Dict[int, int],
                      blocks_to_swap_out: Dict[int, int],
                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
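        """Run one scheduled batch on the driver worker.

        The block swap/copy arguments exist only to satisfy the
        ExecutorBase interface; they must be empty because the Neuron
        backend performs no PagedAttention cache operations.
        """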
        assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
                and blocks_to_copy == {}), (
                    "Cache operations are not supported for Neuron backend.")

        output = self.driver_worker.execute_model(
            seq_group_metadata_list=seq_group_metadata_list)
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        raise NotImplementedError(
            "LoRA is not implemented for Neuron backend.")

    def remove_lora(self, lora_id: int) -> bool:
        raise NotImplementedError(
            "LoRA is not implemented for Neuron backend.")

    def list_loras(self) -> List[int]:
        raise NotImplementedError(
            "LoRA is not implemented for Neuron backend.")

    def check_health(self) -> None:
        # NeuronExecutor will always be healthy as long as it's running.
        return
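

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): roughly
# how an engine loop would drive this executor. The `*_config` objects and
# `scheduled_seq_groups` below are placeholders assumed to come from
# Aphrodite's engine setup and scheduler, not real constructor calls.
#
#     executor = NeuronExecutor(
#         model_config=model_config,
#         cache_config=cache_config,
#         parallel_config=parallel_config,
#         scheduler_config=scheduler_config,
#         device_config=device_config,
#         lora_config=None,  # must be None: LoRA is unsupported on Neuron
#     )
#     output = executor.execute_model(
#         seq_group_metadata_list=scheduled_seq_groups,
#         blocks_to_swap_in={},   # cache operations must be empty
#         blocks_to_swap_out={},
#         blocks_to_copy={},
#     )
# ---------------------------------------------------------------------------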