test_pynccl.py

import multiprocessing
import os

import pytest
import torch

from aphrodite.distributed.device_communicators.pynccl import (
    NCCLCommunicator, ncclGetUniqueId)


def distributed_run(fn, world_size):
    number_of_processes = world_size
    processes = []
    for i in range(number_of_processes):
        env = os.environ.copy()
        env['RANK'] = str(i)
        env['LOCAL_RANK'] = str(i)
        env['WORLD_SIZE'] = str(number_of_processes)
        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
        env['MASTER_ADDR'] = 'localhost'
        env['MASTER_PORT'] = '12345'
        p = multiprocessing.Process(target=fn, args=(env, ))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()


def update_env(fn):
    # `multiprocessing.Process` cannot accept environment variables directly,
    # so we pass them as an argument and apply them inside the wrapper.
    def wrapper(env):
        os.environ.update(env)
        fn()

    return wrapper


@update_env
def worker_fn():
    comm = NCCLCommunicator()
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
    comm.all_reduce(tensor)
    result = tensor.mean().cpu().item()
    assert result == comm.world_size


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl():
    distributed_run(worker_fn, 2)


@update_env
def worker_fn_with_cudagraph():
    with torch.no_grad():
        graph = torch.cuda.CUDAGraph()
        comm = NCCLCommunicator()
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
        torch.cuda.synchronize()
        with torch.cuda.graph(graph, stream=comm.stream):
            # operations during graph capture are recorded but not executed
            # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
            comm.all_reduce(a)
        comm.stream.synchronize()
        # capture does not execute the all-reduce, so `a` is still all ones
        assert a.mean().cpu().item() == comm.world_size**0
        graph.replay()
        comm.stream.synchronize()
        # after one replay, the all-reduce has run exactly once
        assert a.mean().cpu().item() == comm.world_size**1


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl_with_cudagraph():
    distributed_run(worker_fn_with_cudagraph, 2)


def test_ncclGetUniqueId():
    unique_id = ncclGetUniqueId()
    # `list(unique_id.internal)` is something like this:
    # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
    #  0, 0, ..., 0]  (the remaining bytes are all zero)
    # as long as the function doesn't raise an exception, we're good
    assert unique_id is not None
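

# Not part of the original file: a minimal sketch of how these tests could be
# launched directly with the Python interpreter instead of the pytest CLI,
# assuming at least two GPUs are visible. `pytest.main` is pytest's standard
# programmatic entry point and returns an exit code.
if __name__ == "__main__":
    import sys
    sys.exit(pytest.main(["-s", __file__]))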