فهرست منبع

distributed: fix issue for when nodes have multiple network interfaces (#892)

AlpinDale 2 ماه پیش
والد
کامیت
65b71f5fcc
2 فایلهای تغییر یافته به همراه 16 افزوده شده و 1 حذف شده
  1. 4 1
      aphrodite/common/envs.py
  2. 12 0
      aphrodite/executor/ray_gpu_executor.py

+ 4 - 1
aphrodite/common/envs.py

@@ -132,7 +132,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
             os.path.join(get_default_cache_root(), "aphrodite"),
         )),
 
-    # used in distributed environment to determine the master address
+    # Used in a distributed environment to determine the IP address
+    # of the current node when the node has multiple network interfaces.
+    # If you are using multi-node inference, you should set this to a
+    # different value on each node.
     'APHRODITE_HOST_IP':
     lambda: os.getenv('APHRODITE_HOST_IP', "") or os.getenv("HOST_IP", ""),
 

+ 12 - 0
aphrodite/executor/ray_gpu_executor.py

@@ -226,6 +226,18 @@ class RayGPUExecutor(DistributedGPUExecutor):
         for node_id, gpu_ids in node_gpus.items():
             node_gpus[node_id] = sorted(gpu_ids)
 
+        all_ips = set(worker_ips + [driver_ip])
+        n_ips = len(all_ips)
+        n_nodes = len(node_workers)
+        if n_nodes != n_ips:
+            raise RuntimeError(
+                f"Every node should have a unique IP address. Got {n_nodes}"
+                f" nodes with node ids {list(node_workers.keys())} and "
+                f"{n_ips} unique IP addresses {all_ips}. Please check your"
+                " network configuration. If you set `APHRODITE_HOST_IP` or "
+                "`HOST_IP` environment variable, make sure it is unique for"
+                " each node.")
+
         APHRODITE_INSTANCE_ID = get_aphrodite_instance_id()
 
         # Set environment variables for the driver and workers.