ray_utils.py

from typing import List, Optional, Tuple, Union

from aphrodite.common.config import ParallelConfig
from aphrodite.common.sequence import ExecuteModelRequest, IntermediateTensors
from aphrodite.common.utils import get_ip, is_hip, is_tpu, is_xpu
from aphrodite.task_handler.worker_base import WorkerWrapperBase

try:
    import ray

    class RayWorkerWrapper(WorkerWrapperBase):
        """Ray wrapper for aphrodite.task_handler.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            # The compiled DAG runs the main execution in a different thread
            # that calls cuda.set_device. This flag indicates whether
            # set_device has been called on that thread yet.
            self.compiled_dag_cuda_device_set = False

        def get_node_ip(self) -> str:
            return get_ip()

        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
            node_id = ray.get_runtime_context().get_node_id()
            gpu_ids = ray.get_gpu_ids()
            return node_id, gpu_ids

        def execute_model_spmd(
            self, req_or_tuple: Union[ExecuteModelRequest,
                                      Tuple[ExecuteModelRequest,
                                            IntermediateTensors]]):
            """Execute model in SPMD fashion: used only when SPMD worker and
            compiled DAG are both enabled.

            Args:
                req_or_tuple: The request to execute the model, or a tuple
                    containing the request and intermediate tensors.
            """
            # TODO: This is needed right now because Ray DAG executes
            # on a background thread, so we need to reset torch's current
            # device.
            import torch
            if not self.compiled_dag_cuda_device_set:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

            if isinstance(req_or_tuple, tuple):
                execute_model_req, intermediate_tensors = req_or_tuple
            else:
                execute_model_req = req_or_tuple
                intermediate_tensors = None

            output = self.worker._execute_model_spmd(execute_model_req,
                                                     intermediate_tensors)
            if isinstance(output, IntermediateTensors):
                return execute_model_req, output
            return output

    ray_import_err = None

except ImportError as e:
    ray = None  # type: ignore
    ray_import_err = e
    RayWorkerWrapper = None  # type: ignore
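
# Illustrative usage sketch (not part of the original module): RayWorkerWrapper
# is typically created as a Ray actor so that Ray assigns CUDA_VISIBLE_DEVICES
# before the wrapped Worker is initialized. The constructor arguments below are
# assumed from WorkerWrapperBase and may differ in this codebase.
#
#   worker_cls = ray.remote(num_cpus=0, num_gpus=1)(RayWorkerWrapper)
#   worker = worker_cls.remote(
#       worker_module_name="aphrodite.task_handler.worker",  # assumed kwargs
#       worker_class_name="Worker")
#   node_id, gpu_ids = ray.get(worker.get_node_and_gpu_ids.remote())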


def ray_is_available() -> bool:
    """Returns True if Ray is available."""
    return ray is not None


def assert_ray_available():
    """Raise an exception if Ray is not available."""
    if ray is None:
        raise ValueError("Failed to import Ray, please install Ray with "
                         "`pip install ray`.") from ray_import_err


def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    It will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    assert_ray_available()

    # Connect to a ray cluster.
    if is_hip() or is_xpu():
        ray.init(address=ray_address,
                 ignore_reinit_error=True,
                 num_gpus=parallel_config.world_size)
    else:
        ray.init(address=ray_address, ignore_reinit_error=True)

    if parallel_config.placement_group:
        # Placement group is already set.
        return

    device_str = "GPU" if not is_tpu() else "TPU"
    # Create placement group for worker processes
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        if parallel_config.world_size > num_devices_in_cluster:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the cluster.")
        # Create a new placement group
        placement_group_specs = ([{
            device_str: 1
        }] * parallel_config.world_size)
        current_placement_group = ray.util.placement_group(
            placement_group_specs)
        # Wait until PG is ready - this will block until all
        # requested resources are available, and will timeout
        # if they cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group
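
# Illustrative usage sketch (not part of the original module): a typical call
# site connects to Ray and then reads the placement group back off the config.
# The ParallelConfig constructor arguments are omitted because they are not
# shown in this file.
#
#   parallel_config = ParallelConfig(...)  # constructed elsewhere
#   initialize_ray_cluster(parallel_config, ray_address="auto")
#   pg = parallel_config.placement_group
#   print(pg.bundle_specs)  # one {"GPU": 1} (or {"TPU": 1}) bundle per worker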