ray_utils.py

from typing import List, Optional, Tuple, Union

import msgspec

from aphrodite.common.config import ParallelConfig
from aphrodite.common.sequence import ExecuteModelRequest, IntermediateTensors
from aphrodite.common.utils import get_ip, is_hip, is_xpu
from aphrodite.executor.msgspec_utils import decode_hook, encode_hook
from aphrodite.platforms import current_platform
from aphrodite.task_handler.worker_base import WorkerWrapperBase

try:
    import ray

    class RayWorkerWrapper(WorkerWrapperBase):
        """Ray wrapper for aphrodite.task_handler.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            # The compiled DAG runs its main execution on a different
            # thread, which calls cuda.set_device. This flag indicates
            # whether set_device has already been called on that thread.
            self.compiled_dag_cuda_device_set = False

            self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                                         dec_hook=decode_hook)
            self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)

        def get_node_ip(self) -> str:
            return get_ip()

        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
            node_id = ray.get_runtime_context().get_node_id()
            gpu_ids = ray.get_gpu_ids()
            return node_id, gpu_ids
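
        # Illustrative note (assumption, not in the original file): the driver
        # typically gathers these per-worker results through Ray remote calls,
        # roughly:
        #
        #     ray.get([w.get_node_and_gpu_ids.remote() for w in workers])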

        def execute_model_spmd(
            self, req_or_tuple: Union[bytes,
                                      Tuple[bytes,
                                            Optional[IntermediateTensors]]]
        ) -> bytes:
            """Execute the model in SPMD fashion: used only when the SPMD
            worker and the compiled DAG are both enabled.

            Args:
                req_or_tuple: A request, or a tuple of the request and the
                    intermediate tensors. The intermediate tensors are only
                    provided (i.e. not None) when this worker runs a pipeline
                    stage > 0. The request is serialized by msgspec.
            """
            if isinstance(req_or_tuple, bytes):
                serialized_req, intermediate_tensors = req_or_tuple, None
            else:
                serialized_req, intermediate_tensors = req_or_tuple

            execute_model_req = self.input_decoder.decode(serialized_req)

            # TODO: This is needed right now because Ray DAG executes
            # on a background thread, so we need to reset torch's current
            # device.
            import torch
            if not self.compiled_dag_cuda_device_set:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

            output = self.worker._execute_model_spmd(execute_model_req,
                                                     intermediate_tensors)
            # Pipeline the model request and output to the next pipeline stage.
            if isinstance(output, IntermediateTensors):
                output = serialized_req, output
            else:
                output = self.output_encoder.encode(output)
            return output
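
    # Illustrative sketch (assumption, not in the original file): a driver-side
    # caller of the SPMD path is expected to mirror the codec pair configured
    # in __init__, roughly:
    #
    #     encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
    #     serialized_req = encoder.encode(execute_model_req)
    #     raw_out = ray.get(worker.execute_model_spmd.remote(serialized_req))
    #     output = msgspec.msgpack.Decoder(dec_hook=decode_hook).decode(raw_out)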

    ray_import_err = None

except ImportError as e:
    ray = None  # type: ignore
    ray_import_err = e
    RayWorkerWrapper = None  # type: ignore


def ray_is_available() -> bool:
    """Returns True if Ray is available."""
    return ray is not None


def assert_ray_available():
    """Raise an exception if Ray is not available."""
    if ray is None:
        raise ValueError("Failed to import Ray, please install Ray with "
                         "`pip install ray`.") from ray_import_err


def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    This connects to the Ray cluster and creates a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    assert_ray_available()

    # Connect to a ray cluster.
    if is_hip() or is_xpu():
        ray.init(address=ray_address,
                 ignore_reinit_error=True,
                 num_gpus=parallel_config.world_size)
    else:
        ray.init(address=ray_address, ignore_reinit_error=True)

    if parallel_config.placement_group:
        # Placement group is already set.
        return

    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
    # Create placement group for worker processes.
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group.
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        if parallel_config.world_size > num_devices_in_cluster:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the cluster.")
        # Create a new placement group.
        placement_group_specs = ([{
            device_str: 1
        }] * parallel_config.world_size)
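        # e.g. for world_size == 4 this is [{"GPU": 1}] * 4, i.e. one
        # single-device bundle per worker.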
        current_placement_group = ray.util.placement_group(
            placement_group_specs)
        # Wait until PG is ready - this will block until all
        # requested resources are available, and will timeout
        # if they cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    # Set the placement group in the parallel config.
    parallel_config.placement_group = current_placement_group
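
# Illustrative usage sketch (assumptions, not part of the original file): an
# executor is expected to call initialize_ray_cluster() once, then schedule
# RayWorkerWrapper actors inside the resulting placement group, roughly:
#
#     from ray.util.scheduling_strategies import (
#         PlacementGroupSchedulingStrategy)
#
#     initialize_ray_cluster(parallel_config)
#     pg = parallel_config.placement_group
#     worker = ray.remote(
#         num_gpus=1,
#         scheduling_strategy=PlacementGroupSchedulingStrategy(
#             placement_group=pg),
#     )(RayWorkerWrapper).remote(...)  # constructor arguments are hypothetical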