#!/bin/bash
#
# run_cluster.sh - start a Ray head or worker node inside a Docker container.
#
# Usage:
#   run_cluster.sh docker_image head_node_address --head|--worker \
#       path_to_hf_home [additional_args...]
#
# Arguments:
#   docker_image       Image handed to `docker run`.
#   head_node_address  Address a worker connects to (port 6379 is appended).
#                      Still required, but not used, when --head is given.
#   --head|--worker    Role of this node.
#   path_to_hf_home    Host directory mounted at /root/.cache/huggingface
#                      inside the container.
#   additional_args    Extra options forwarded, one word per argument, to
#                      `docker run` (placed before the image name).
#
# The container is always named "node". It is stopped and removed when this
# script exits. The script's exit status is that of `docker run`, or 1 when
# the arguments are invalid.

# Name given to the container; shared by `docker run` and the cleanup handler.
readonly CONTAINER_NAME="node"
# Port the head node listens on and workers connect to.
readonly RAY_PORT=6379

# Print the usage line to stderr.
usage() {
  echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]" >&2
}

# Check for minimum number of required arguments.
if [[ $# -lt 4 ]]; then
  usage
  exit 1
fi

# Assign the four required arguments and shift them away.
DOCKER_IMAGE="$1"
HEAD_NODE_ADDRESS="$2"
NODE_TYPE="$3" # Should be --head or --worker
PATH_TO_HF_HOME="$4"
shift 4

# Everything left over is passed directly to the Docker command. An array is
# used (rather than a flat string) so that an argument containing spaces or
# glob characters reaches docker as the single word the caller typed.
ADDITIONAL_ARGS=("$@")

# Validate node type.
if [[ "${NODE_TYPE}" != "--head" && "${NODE_TYPE}" != "--worker" ]]; then
  echo "Error: Node type must be --head or --worker" >&2
  exit 1
fi

# Stop and remove the container. Registered on EXIT below, so it runs on
# every exit path after this point; it is installed only after argument
# validation so that a usage error never touches Docker.
cleanup() {
  docker stop "${CONTAINER_NAME}"
  docker rm "${CONTAINER_NAME}"
}
trap cleanup EXIT

# Build the command run inside the container: the head node opens RAY_PORT,
# a worker connects to the head node's address on that same port.
RAY_START_CMD="ray start --block"
if [[ "${NODE_TYPE}" == "--head" ]]; then
  RAY_START_CMD+=" --head --port=${RAY_PORT}"
else
  RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:${RAY_PORT}"
fi

# Run the docker command with the user specified parameters and additional
# arguments. The image's entrypoint is replaced by /bin/bash, which receives
# RAY_START_CMD as a single `-c` string.
docker run \
  --entrypoint /bin/bash \
  --network host \
  --name "${CONTAINER_NAME}" \
  --shm-size 10.24g \
  --gpus all \
  -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
  "${ADDITIONAL_ARGS[@]}" \
  "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"