# syntax=docker/dockerfile:1
  1. # Inspired by https://github.com/anibali/docker-pytorch/blob/master/dockerfiles/1.10.0-cuda11.3-ubuntu20.04/Dockerfile
  2. # ARG COMPAT=0
  3. ARG PERSONAL=0
  4. # FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 as base-0
  5. FROM nvcr.io/nvidia/pytorch:22.12-py3 as base
  6. ENV HOST docker
  7. ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
  8. # https://serverfault.com/questions/683605/docker-container-time-timezone-will-not-reflect-changes
  9. ENV TZ America/Los_Angeles
  10. RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
  11. # git for installing dependencies
  12. # tzdata to set time zone
  13. # wget and unzip to download data
  14. # [2021-09-09] TD: zsh, stow, subversion, fasd are for setting up my personal environment.
  15. # [2021-12-07] TD: openmpi-bin for MPI (multi-node training)
  16. RUN apt-get update && apt-get install -y --no-install-recommends \
  17. build-essential \
  18. cmake \
  19. curl \
  20. ca-certificates \
  21. sudo \
  22. less \
  23. htop \
  24. git \
  25. tzdata \
  26. wget \
  27. tmux \
  28. zip \
  29. unzip \
  30. zsh stow subversion fasd \
  31. && rm -rf /var/lib/apt/lists/*
  32. # openmpi-bin \
  33. # Allow running runmpi as root
  34. # ENV OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
  35. # # Create a non-root user and switch to it
  36. # RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
  37. # && echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
  38. # USER user
  39. # All users can use /home/user as their home directory
  40. ENV HOME=/home/user
  41. RUN mkdir -p /home/user && chmod 777 /home/user
  42. WORKDIR /home/user
  43. # Set up personal environment
  44. # FROM base-${COMPAT} as env-0
  45. FROM base as env-0
  46. FROM env-0 as env-1
  47. # Use ONBUILD so that the dotfiles dir doesn't need to exist unless we're building a personal image
  48. # https://stackoverflow.com/questions/31528384/conditional-copy-add-in-dockerfile
  49. ONBUILD COPY dotfiles ./dotfiles
  50. ONBUILD RUN cd ~/dotfiles && stow bash zsh tmux && sudo chsh -s /usr/bin/zsh $(whoami)
  51. # nvcr pytorch image sets SHELL=/bin/bash
  52. ONBUILD ENV SHELL=/bin/zsh
  53. FROM env-${PERSONAL} as packages
  54. # Disable pip cache: https://stackoverflow.com/questions/45594707/what-is-pips-no-cache-dir-good-for
  55. ENV PIP_NO_CACHE_DIR=1
  56. # # apex and pytorch-fast-transformers take a while to compile so we install them first
  57. # TD [2022-04-28] apex is already installed. In case we need a newer commit:
  58. # RUN pip install --upgrade --force-reinstall --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" --global-option="--fmha" --global-option="--fast_layer_norm" --global-option="--xentropy" git+https://github.com/NVIDIA/apex.git#egg=apex
  59. # xgboost conflicts with deepspeed
  60. RUN pip uninstall -y xgboost && DS_BUILD_UTILS=1 DS_BUILD_FUSED_LAMB=1 pip install deepspeed==0.7.7
  61. # General packages that we don't care about the version
  62. # zstandard to extract the_pile dataset
  63. # psutil to get the number of cpu physical cores
  64. # twine to upload package to PyPI
  65. RUN pip install pytest matplotlib jupyter ipython ipdb gpustat scikit-learn spacy munch einops opt_einsum fvcore gsutil cmake pykeops zstandard psutil h5py twine gdown \
  66. && python -m spacy download en_core_web_sm
  67. # hydra
  68. RUN pip install hydra-core==1.3.1 hydra-colorlog==1.2.0 hydra-optuna-sweeper==1.2.0 pyrootutils rich
  69. # Core packages
  70. RUN pip install transformers==4.25.1 datasets==2.8.0 pytorch-lightning==1.8.6 triton==2.0.0.dev20221202 wandb==0.13.7 timm==0.6.12 torchmetrics==0.10.3
  71. # torchmetrics 0.11.0 broke hydra's instantiate
  72. # For MLPerf
  73. RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0
  74. # Install FlashAttention
  75. RUN pip install flash-attn==2.6.3
  76. # Install CUDA extensions for fused dense
  77. RUN pip install git+https://github.com/Dao-AILab/flash-attention@v2.6.3#subdirectory=csrc/fused_dense_lib