@@ -1,164 +1,26 @@
+import contextlib
import io
import os
import re
import subprocess
from typing import List, Set
-import sys
-from typing import List
+import warnings
+from pathlib import Path
from packaging.version import parse, Version
-from setuptools import setup, find_packages, Extension
-from setuptools.command.build_ext import build_ext
-from shutil import which
+import setuptools
import torch
-from torch.utils.cpp_extension import CUDA_HOME
+import torch.utils.cpp_extension as torch_cpp_ext
+from torch.utils.cpp_extension import (BuildExtension, CUDAExtension,
ROOT_DIR = os.path.dirname(__file__)
-def is_sccache_available() -> bool:
- return which("sccache") is not None
-def is_ccache_available() -> bool:
- return which("ccache") is not None
-def is_ninja_available() -> bool:
- return which("ninja") is not None
-def remove_prefix(text, prefix):
- if text.startswith(prefix):
- return text[len(prefix):]
- return text
-class CMakeExtension(Extension):
- def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
- super().__init__(name, sources=[], **kwa)
- self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
-class cmake_build_ext(build_ext):
- # A dict of extension directories that have been configured.
- did_config = {}
- # Determine number of compilation jobs and optionally nvcc compile threads.
- def compute_num_jobs(self):
- try:
- # os.sched_getaffinity() isn't universally available, so fall back
- # to os.cpu_count() if we get an error here.
- num_jobs = len(os.sched_getaffinity(0))
- except AttributeError:
- num_jobs = os.cpu_count()
- nvcc_cuda_version = get_nvcc_cuda_version()
- if nvcc_cuda_version >= Version("11.2"):
- nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
- num_jobs = max(1, round(num_jobs / (nvcc_threads / 4)))
- else:
- nvcc_threads = None
- return num_jobs, nvcc_threads
- # Perform cmake configuration for a single extension.
- def configure(self, ext: CMakeExtension) -> None:
- # If we've already configured using the CMakeLists.txt for
- # this extension, exit early.
- if ext.cmake_lists_dir in cmake_build_ext.did_config:
- return
- cmake_build_ext.did_config[ext.cmake_lists_dir] = True
- # Select the build type.
- # Note: optimization level + debug info are set by the build type
- default_cfg = "Debug" if self.debug else "RelWithDebInfo"
- cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
- # where .so files will be written, should be the same for all extensions
- # that use the same CMakeLists.txt.
- outdir = os.path.abspath(
- os.path.dirname(self.get_ext_fullpath(ext.name)))
- cmake_args = [
- '-DCMAKE_BUILD_TYPE={}'.format(cfg),
- '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
- ]
- verbose = bool(int(os.getenv('VERBOSE', '0')))
- if verbose:
- cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
- if is_sccache_available():
- cmake_args += [
- ]
- elif is_ccache_available():
- cmake_args += [
- ]
- # Pass the python executable to cmake so it can find an exact
- # match.
- cmake_args += ['-DAPHRODITE_PYTHON_EXECUTABLE={}'.format(sys.executable)]
- if _install_punica():
- if _install_hadamard():
- #
- # Setup parallelism and build tool
- #
- num_jobs, nvcc_threads = self.compute_num_jobs()
- if nvcc_threads:
- cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]
- if is_ninja_available():
- build_tool = ['-G', 'Ninja']
- cmake_args += [
- '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
- ]
- else:
- # Default build tool to whatever cmake picks.
- build_tool = []
- subprocess.check_call(
- ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
- cwd=self.build_temp)
- def build_extensions(self) -> None:
- # Ensure that CMake is present and working
- try:
- subprocess.check_output(['cmake', '--version'])
- except OSError as e:
- raise RuntimeError('Cannot find CMake executable') from e
- # Create build directory if it does not exist.
- if not os.path.exists(self.build_temp):
- os.makedirs(self.build_temp)
- # Build all the extensions
- for ext in self.extensions:
- self.configure(ext)
- ext_target_name = remove_prefix(ext.name, "aphrodite.")
- num_jobs, _ = self.compute_num_jobs()
- build_args = [
- '--build', '.', '--target', ext_target_name, '-j',
- str(num_jobs)
- ]
- subprocess.check_call(['cmake', *build_args], cwd=self.build_temp)
+# Supported NVIDIA GPU architectures.
+NVIDIA_SUPPORTED_ARCHS = {"6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942", "gfx1100"}
def _is_hip() -> bool:
@@ -168,25 +30,26 @@ def _is_hip() -> bool:
def _is_cuda() -> bool:
return torch.version.cuda is not None
-def _install_punica() -> bool:
- return bool(int(os.getenv("APHRODITE_INSTALL_PUNICA_KERNELS", "0")))
-def _install_hadamard() -> bool:
- return bool(int(os.getenv("APHRODITE_INSTALL_HADAMARD_KERNELS", "0")))
+# Compiler flags.
+CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
+# TODO: Should we use -O3?
+NVCC_FLAGS = ["-O2", "-std=c++17"]
-def get_path(*filepath) -> str:
- return os.path.join(ROOT_DIR, *filepath)
+if _is_hip():
+ if ROCM_HOME is None:
+ raise RuntimeError(
+ "Cannot find ROCM_HOME. ROCm must be available to build the "
+ "package.")
-def find_version(filepath: str) -> str:
- """Extract version information from the given filepath.
- Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
- """
- with open(filepath) as fp:
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
- fp.read(), re.M)
- if version_match:
- return version_match.group(1)
- raise RuntimeError("Unable to find version string.")
+if _is_cuda() and CUDA_HOME is None:
+ raise RuntimeError(
+ "Cannot find CUDA_HOME. CUDA must be available to build the package.")
+ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
def get_hipcc_rocm_version():
@@ -211,12 +74,17 @@ def get_hipcc_rocm_version():
return None
-def get_nvcc_cuda_version() -> Version:
+def glob(pattern: str) -> List[str]:
+    """Return the paths matching ``pattern`` as strings, in sorted order.
+
+    The pattern is resolved against the current working directory, so the
+    results are relative paths like the hard-coded ``kernels/...`` entries
+    used elsewhere in this file.
+
+    The previous ``Path(__name__).parent`` only produced that directory by
+    accident: ``__name__`` is a module name (e.g. ``"__main__"``), not a
+    path, and the parent of a single-component path is always ``"."``.
+    The result is sorted so the returned list does not depend on the order
+    in which the filesystem enumerates directory entries.
+    """
+    root = Path(".")
+    return sorted(str(p) for p in root.glob(pattern))
+def get_nvcc_cuda_version(cuda_dir: str) -> Version:
"""Get the CUDA version from nvcc.
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
- nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
+ nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
output = nvcc_output.split()
release_idx = output.index("release") + 1
@@ -224,6 +92,300 @@ def get_nvcc_cuda_version() -> Version:
return nvcc_cuda_version
+def get_pytorch_rocm_arch() -> Set[str]:
+    """Return the supported ROCm GPU architectures to build for.
+
+    Candidates are read from the ``PYTORCH_ROCM_ARCH`` environment variable
+    (entries separated by ``;`` or spaces). When the variable is unset, the
+    output of the ``rocm_agent_enumerator`` command is used instead, with
+    one architecture per output line.
+
+    Returns:
+        The requested architectures that are in ``ROCM_SUPPORTED_ARCHS``.
+
+    Raises:
+        RuntimeError: If none of the requested architectures is supported.
+
+    A warning is issued when some, but not all, of the requested
+    architectures are unsupported; those entries are dropped.
+    """
+    env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)
+
+    # If we don't have PYTORCH_ROCM_ARCH specified pull the list from
+    # rocm_agent_enumerator.
+    if env_arch_list is None:
+        command = "rocm_agent_enumerator"
+        env_arch_list = subprocess.check_output([command]).decode('utf-8')\
+            .strip().replace("\n", ";")
+        arch_source_str = "rocm_agent_enumerator"
+    else:
+        arch_source_str = "PYTORCH_ROCM_ARCH env variable"
+
+    # Entries are separated by ; or space. Empty tokens (from a trailing or
+    # doubled separator) are not architectures, so drop them rather than
+    # reporting '' as unsupported.
+    pytorch_rocm_arch = {
+        arch
+        for arch in env_arch_list.replace(" ", ";").split(";") if arch
+    }
+
+    # Keep only the supported architectures.
+    arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS)
+
+    # If none of the specified architectures are valid, raise an error.
+    if not arch_list:
+        raise RuntimeError(
+            f"None of the ROCM architectures in {arch_source_str} "
+            f"({env_arch_list}) is supported. "
+            f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.")
+
+    # Warn about the requested architectures that were filtered out.
+    invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS
+    if invalid_arch_list:
+        warnings.warn(
+            f"Unsupported ROCM architectures ({invalid_arch_list}) are "
+            f"excluded from the {arch_source_str} output "
+            f"({env_arch_list}). Supported ROCM architectures are: "
+            f"{ROCM_SUPPORTED_ARCHS}.",
+            stacklevel=2)
+    return arch_list
+def get_torch_arch_list() -> Set[str]:
+    """Return the supported CUDA architectures from ``TORCH_CUDA_ARCH_LIST``.
+
+    Returns:
+        The requested architectures that are supported, or an empty set when
+        the variable is unset or contains no entries.
+
+    Raises:
+        RuntimeError: If entries were given but none of them is supported.
+
+    A warning is issued when some, but not all, of the requested
+    architectures are unsupported; those entries are dropped.
+    """
+    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
+    # e.g. "8.0" or "7.5;8.0;8.6+PTX". Here, the "8.6+PTX" option asks the
+    # compiler to additionally include PTX code that can be runtime-compiled
+    # and executed on the 8.6 or newer architectures. While the PTX code will
+    # not give the best performance on the newer architectures, it provides
+    # forward compatibility.
+    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+    if env_arch_list is None:
+        return set()
+
+    # Entries are separated by ; or space. "".split(";") yields [""], so
+    # empty tokens must be dropped; otherwise an empty variable would never
+    # reach the early return below and would raise instead.
+    torch_arch_list = {
+        arch
+        for arch in env_arch_list.replace(" ", ";").split(";") if arch
+    }
+    if not torch_arch_list:
+        return set()
+
+    # Every supported architecture is also accepted with a "+PTX" suffix.
+    valid_archs = NVIDIA_SUPPORTED_ARCHS.union(
+        {s + "+PTX"
+         for s in NVIDIA_SUPPORTED_ARCHS})
+    arch_list = torch_arch_list.intersection(valid_archs)
+
+    # If none of the specified architectures are valid, raise an error.
+    if not arch_list:
+        raise RuntimeError(
+            "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` "
+            f"env variable ({env_arch_list}) is supported. "
+            f"Supported CUDA architectures are: {valid_archs}.")
+
+    # Warn about the requested architectures that were filtered out.
+    invalid_arch_list = torch_arch_list - valid_archs
+    if invalid_arch_list:
+        warnings.warn(
+            f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are "
+            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
+            f"({env_arch_list}). Supported CUDA/ROCM architectures are: "
+            f"{valid_archs}.",
+            stacklevel=2)
+    return arch_list
+if _is_hip():
+ rocm_arches = get_pytorch_rocm_arch()
+ NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches]
+ # First, check the TORCH_CUDA_ARCH_LIST environment variable.
+ compute_capabilities = get_torch_arch_list()
+if _is_cuda() and not compute_capabilities:
+ # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
+ # GPUs on the current machine.
+ device_count = torch.cuda.device_count()
+ for i in range(device_count):
+ major, minor = torch.cuda.get_device_capability(i)
+ if major < 6:
+ raise RuntimeError(
+ "GPUs with compute capability below 6.0 are not supported.")
+ compute_capabilities.add(f"{major}.{minor}")
+ext_modules = []
+if _is_cuda():
+ nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
+ if not compute_capabilities:
+ # If no GPU is specified nor available, add all supported architectures
+ # based on the NVCC CUDA version.
+ compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy()
+ if nvcc_cuda_version < Version("11.1"):
+ compute_capabilities.remove("8.6")
+ if nvcc_cuda_version < Version("11.8"):
+ compute_capabilities.remove("8.9")
+ compute_capabilities.remove("9.0")
+ # Validate the NVCC CUDA version.
+ if nvcc_cuda_version < Version("11.0"):
+ raise RuntimeError(
+ "CUDA 11.0 or higher is required to build the package.")
+ if (nvcc_cuda_version < Version("11.1")
+ and any(cc.startswith("8.6") for cc in compute_capabilities)):
+ raise RuntimeError(
+ "CUDA 11.1 or higher is required for compute capability 8.6.")
+ if nvcc_cuda_version < Version("11.8"):
+ if any(cc.startswith("8.9") for cc in compute_capabilities):
+ # CUDA 11.8 is required to generate the code targeting compute
+ # capability 8.9. However, GPUs with compute capability 8.9 can
+ # also run the code generated by the previous versions of CUDA 11
+ # and targeting compute capability 8.0. Therefore, if CUDA 11.8 is
+ # not available, we target compute capability 8.0 instead of 8.9.
+ warnings.warn(
+ "CUDA 11.8 or higher is required for compute capability 8.9. "
+ "Targeting compute capability 8.0 instead.",
+ stacklevel=2)
+ compute_capabilities = set(cc for cc in compute_capabilities
+ if not cc.startswith("8.9"))
+ compute_capabilities.add("8.0+PTX")
+ if any(cc.startswith("9.0") for cc in compute_capabilities):
+ raise RuntimeError(
+ "CUDA 11.8 or higher is required for compute capability 9.0.")
+ # Add target compute capabilities to NVCC flags.
+ for capability in compute_capabilities:
+ num = capability[0] + capability[2]
+ NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
+ if capability.endswith("+PTX"):
+ "-gencode", f"arch=compute_{num},code=compute_{num}"
+ ]
+ if int(capability[0]) >= 8:
+ "-gencode", f"arch=compute_{num},code=sm_{num}"
+ ]
+ if capability.endswith("+PTX"):
+ "-gencode", f"arch=compute_{num},code=compute_{num}"
+ ]
+    # Use NVCC threads to parallelize the build.
+    if nvcc_cuda_version >= Version("11.2"):
+        nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
+        # os.cpu_count() returns None when the CPU count cannot be
+        # determined; fall back to 1 so min() does not raise a TypeError
+        # comparing None with an int.
+        num_threads = min(os.cpu_count() or 1, nvcc_threads)
+        NVCC_FLAGS += ["--threads", str(num_threads)]
+ if nvcc_cuda_version >= Version("11.8"):
+ # changes for punica kernels
+ ]
+ for flag in REMOVE_NVCC_FLAGS:
+ with contextlib.suppress(ValueError):
+ torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag)
+ install_punica = bool(
+ int(os.getenv("APHRODITE_INSTALL_PUNICA_KERNELS", "1")))
+ device_count = torch.cuda.device_count()
+ for i in range(device_count):
+ major, minor = torch.cuda.get_device_capability(i)
+ if major < 8:
+ install_punica = False
+ break
+ if install_punica:
+ ext_modules.append(
+ CUDAExtension(
+ name="aphrodite._punica_C",
+ sources=["kernels/punica/punica_ops.cc"] +
+ glob("kernels/punica/bgmv/*.cu"),
+ extra_compile_args={
+ "cxx": CXX_FLAGS,
+ },
+ ))
+ install_hadamard = bool(
+ device_count = torch.cuda.device_count()
+ for i in range(device_count):
+ major, minor = torch.cuda.get_device_capability(i)
+ if major < 7:
+ install_hadamard = False
+ break
+ if install_hadamard:
+ ext_modules.append(
+ CUDAExtension(
+ name="aphrodite._hadamard_C",
+ sources=[
+ "kernels/hadamard/fast_hadamard_transform.cpp",
+ "kernels/hadamard/fast_hadamard_transform_cuda.cu"
+ ],
+ extra_compile_args={
+ "cxx": CXX_FLAGS,
+ "nvcc": NVCC_FLAGS,
+ },
+ ))
+aphrodite_extension_sources = [
+ "kernels/cache_kernels.cu",
+ "kernels/attention/attention_kernels.cu",
+ "kernels/pos_encoding_kernels.cu",
+ "kernels/activation_kernels.cu",
+ "kernels/layernorm_kernels.cu",
+ "kernels/quantization/squeezellm/quant_cuda_kernel.cu",
+ "kernels/quantization/gguf/gguf_kernel.cu",
+ "kernels/quantization/gptq/q_gemm.cu",
+ "kernels/quantization/exl2/q_matrix.cu",
+ "kernels/quantization/exl2/q_gemm_exl2.cu",
+ "kernels/cuda_utils_kernels.cu",
+ "kernels/moe/align_block_size_kernel.cu",
+ "kernels/pybind.cpp",
+if _is_cuda():
+ aphrodite_extension_sources.append(
+ "kernels/quantization/awq/gemm_kernels.cu")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/quip/origin_order.cu")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/marlin/marlin_cuda_kernel.cu")
+ aphrodite_extension_sources.append(
+ "kernels/all_reduce/custom_all_reduce.cu")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/aqlm/aqlm_cuda_entry.cpp")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/aqlm/aqlm_cuda_kernel.cu")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/bitsandbytes/int4_fp16_gemm_kernels.cu")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/bitsandbytes/format.cu")
+ aphrodite_extension_sources.append(
+ "kernels/quantization/bitsandbytes/gemm_s4_f16.cu")
+ ext_modules.append(
+ CUDAExtension(
+ name="aphrodite._moe_C",
+ sources=glob("kernels/moe/*.cu") + glob("kernels/moe/*.cpp"),
+ extra_compile_args={
+ "cxx": CXX_FLAGS,
+ "nvcc": NVCC_FLAGS,
+ },
+ ))
+# Main extension module "aphrodite._C", built from
+# aphrodite_extension_sources with the shared CXX_FLAGS / NVCC_FLAGS.
+aphrodite_extension = CUDAExtension(
+ name="aphrodite._C",
+ sources=aphrodite_extension_sources,
+ extra_compile_args={
+ "cxx": CXX_FLAGS,
+ "nvcc": NVCC_FLAGS,
+ },
+ # NOTE(review): only "cuda" below looks like a library name; the other two
+ # entries are the same directory paths that are passed to library_dirs.
+ # setuptools documents `libraries` as library names rather than paths --
+ # confirm these entries are intended.
+ libraries=[
+ "cuda", "conda/envs/aphrodite-runtime/lib",
+ "conda/envs/aphrodite-runtime/lib/stubs"
+ ] if _is_cuda() else [],
+ # Link-time search directories; only supplied for CUDA builds.
+ library_dirs=[
+ "conda/envs/aphrodite-runtime/lib",
+ "conda/envs/aphrodite-runtime/lib/stubs"
+ ] if _is_cuda() else [],
+def get_path(*filepath) -> str:
+    """Join the given path components onto ``ROOT_DIR``."""
+    segments = (ROOT_DIR, *filepath)
+    return os.path.join(*segments)
+def find_version(filepath: str) -> str:
+    """Return the ``__version__`` string assigned in the file at ``filepath``.
+
+    The file is scanned for a line that starts with
+    ``__version__ = "<value>"`` (single or double quotes) and ``<value>`` is
+    returned.
+
+    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
+
+    Raises:
+        RuntimeError: If no such assignment is found.
+    """
+    version_pattern = re.compile(r"^__version__ = ['\"]([^'\"]*)['\"]", re.M)
+    with open(filepath) as fp:
+        contents = fp.read()
+    found = version_pattern.search(contents)
+    if found is None:
+        raise RuntimeError("Unable to find version string.")
+    return found.group(1)
def get_aphrodite_version() -> str:
version = find_version(get_path("aphrodite", "__init__.py"))
@@ -235,7 +397,7 @@ def get_aphrodite_version() -> str:
rocm_version_str = hipcc_version.replace(".", "")[:3]
version += f"+rocm{rocm_version_str}"
- cuda_version = str(get_nvcc_cuda_version())
+ cuda_version = str(nvcc_cuda_version)
if cuda_version != MAIN_CUDA_VERSION:
cuda_version_str = cuda_version.replace(".", "")[:3]
version += f"+cu{cuda_version_str}"
@@ -263,19 +425,7 @@ def get_requirements() -> List[str]:
return requirements
-ext_modules = []
-if _is_cuda():
- ext_modules.append(CMakeExtension(name="aphrodite._moe_C"))
- if _install_punica():
- ext_modules.append(CMakeExtension(name="aphrodite._punica_C"))
- if _install_hadamard():
- ext_modules.append(CMakeExtension(name="aphrodite._hadamard_C"))
- ext_modules.append(CMakeExtension(name="aphrodite._C"))
@@ -298,12 +448,12 @@ setup(
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", # noqa: E501
"Topic :: Scientific/Engineering :: Artificial Intelligence",
- packages=find_packages(exclude=("kernels", "examples",
+ packages=setuptools.find_packages(exclude=("kernels", "examples",
- cmdclass={"build_ext": cmake_build_ext},
+ cmdclass={"build_ext": BuildExtension},
"aphrodite": [