123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- import io
- import os
- import re
- import subprocess
- from typing import List, Set
- import warnings
- from packaging.version import parse, Version
- import setuptools
- import torch
- from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
# Directory containing this script; repo-relative paths are resolved from it.
ROOT_DIR = os.path.dirname(__file__)

# Supported NVIDIA GPU architectures (CUDA compute capabilities).
# NOTE: "6.5" is intentionally absent. It is not a real compute capability,
# and because this set is used as the default target list when no GPU is
# specified or detected, listing it would pass `arch=compute_65` to nvcc.
SUPPORTED_ARCHS = {"6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO: Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

# Mirror the libstdc++ ABI setting reported by the installed PyTorch in both
# the host-compiler and nvcc flag lists.
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]

# Fail early: everything below needs nvcc from a CUDA installation.
if CUDA_HOME is None:
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available to build the package.")
def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Return the CUDA release reported by ``nvcc -V``.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py

    Args:
        cuda_dir: Root of the CUDA installation; ``bin/nvcc`` beneath it is
            the executable that gets run.

    Returns:
        The parsed version taken from the token that follows the word
        "release" in nvcc's output.

    Raises:
        subprocess.CalledProcessError: If nvcc exits with a non-zero status.
        ValueError: If the output does not contain the word "release".
    """
    nvcc_path = f"{cuda_dir}/bin/nvcc"
    tokens = subprocess.check_output([nvcc_path, "-V"],
                                     universal_newlines=True).split()
    # The token right after "release" carries the version followed by a
    # comma; keep only the part before the comma.
    release_token = tokens[tokens.index("release") + 1]
    version_text = release_token.split(",")[0]
    return parse(version_text)
def get_torch_arch_list() -> Set[str]:
    """Return the supported architectures requested via TORCH_CUDA_ARCH_LIST.

    TORCH_CUDA_ARCH_LIST can hold one or more architectures separated by
    ";" or whitespace, e.g. "8.0" or "7.5;8.0;8.6+PTX". Here, the "8.6+PTX"
    option asks the compiler to additionally include PTX code that can be
    runtime-compiled and executed on the 8.6 or newer architectures. While
    the PTX code will not give the best performance on the newer
    architectures, it provides forward compatibility.

    Returns:
        The requested entries that are supported (with or without the
        "+PTX" suffix). An empty set is returned when the variable is
        unset, empty, or contains only separators.

    Raises:
        RuntimeError: If entries were given but none of them is supported.

    Warns:
        UserWarning: If some, but not all, of the entries are unsupported;
            those entries are dropped from the result.
    """
    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
    if env_arch_list is None:
        return set()

    # Entries are separated by ";" or whitespace. Splitting on whitespace
    # after normalising ";" drops empty tokens, so an empty value or repeated
    # separators do not produce a bogus "" architecture.
    torch_arch_list = set(env_arch_list.replace(";", " ").split())
    if not torch_arch_list:
        return set()

    # Filter out the invalid architectures and print a warning.
    valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS})
    arch_list = torch_arch_list.intersection(valid_archs)
    # If none of the specified architectures are valid, raise an error.
    if not arch_list:
        raise RuntimeError(
            "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
            f"variable ({env_arch_list}) is supported. "
            f"Supported CUDA architectures are: {valid_archs}.")
    invalid_arch_list = torch_arch_list - valid_archs
    if invalid_arch_list:
        warnings.warn(
            f"Unsupported CUDA architectures ({invalid_arch_list}) are "
            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
            f"({env_arch_list}). Supported CUDA architectures are: "
            f"{valid_archs}.")
    return arch_list
# Decide which compute capabilities to build for, validate them against the
# installed nvcc, and extend NVCC_FLAGS accordingly.

# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
if not compute_capabilities:
    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
    # GPUs on the current machine.
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 6:
            raise RuntimeError(
                "GPUs with compute capability below 6.0 are not supported.")
        compute_capabilities.add(f"{major}.{minor}")

nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if not compute_capabilities:
    # If no GPU is specified nor available, add all supported architectures
    # based on the NVCC CUDA version.
    compute_capabilities = SUPPORTED_ARCHS.copy()
    # `discard` (not `remove`) so that trimming SUPPORTED_ARCHS later cannot
    # turn these clean-ups into a KeyError.
    if nvcc_cuda_version < Version("11.1"):
        compute_capabilities.discard("8.6")
    if nvcc_cuda_version < Version("11.8"):
        compute_capabilities.discard("8.9")
        compute_capabilities.discard("9.0")

# Validate the NVCC CUDA version.
if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
if nvcc_cuda_version < Version("11.1"):
    if any(cc.startswith("8.6") for cc in compute_capabilities):
        raise RuntimeError(
            "CUDA 11.1 or higher is required for compute capability 8.6.")
if nvcc_cuda_version < Version("11.8"):
    if any(cc.startswith("8.9") for cc in compute_capabilities):
        # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
        # However, GPUs with compute capability 8.9 can also run the code generated by
        # the previous versions of CUDA 11 and targeting compute capability 8.0.
        # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
        # instead of 8.9.
        warnings.warn(
            "CUDA 11.8 or higher is required for compute capability 8.9. "
            "Targeting compute capability 8.0 instead.")
        compute_capabilities = set(cc for cc in compute_capabilities
                                   if not cc.startswith("8.9"))
        compute_capabilities.add("8.0+PTX")
    if any(cc.startswith("9.0") for cc in compute_capabilities):
        raise RuntimeError(
            "CUDA 11.8 or higher is required for compute capability 9.0.")

# Add target compute capabilities to NVCC flags.
# Iterate in sorted order so the flag list is identical on every run; the
# iteration order of a set of strings is not stable across interpreter runs.
for capability in sorted(compute_capabilities):
    # "8.6" / "8.6+PTX" -> "86": first and third characters of the entry.
    num = capability[0] + capability[2]
    NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
    if capability.endswith("+PTX"):
        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]

# Use NVCC threads to parallelize the build.
if nvcc_cuda_version >= Version("11.2"):
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single thread instead of failing in min().
    num_threads = min(os.cpu_count() or 1, 8)
    NVCC_FLAGS += ["--threads", str(num_threads)]
def _cuda_extension(name, sources):
    """Create a CUDAExtension for `sources` using the shared compiler flags.

    A fresh `extra_compile_args` dict is built on every call; its values are
    the module-level CXX_FLAGS and NVCC_FLAGS lists.
    """
    return CUDAExtension(
        name=name,
        sources=sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
    )


# Cache operations.
cache_extension = _cuda_extension(
    "aphrodite.cache_ops",
    ["kernels/cache.cpp", "kernels/cache_kernels.cu"],
)

# Attention kernels.
attention_extension = _cuda_extension(
    "aphrodite.attention_ops",
    ["kernels/attention.cpp", "kernels/attention/attention_kernels.cu"],
)

# Positional encoding kernels.
positional_encoding_extension = _cuda_extension(
    "aphrodite.pos_encoding_ops",
    ["kernels/pos_encoding.cpp", "kernels/pos_encoding_kernels.cu"],
)

# Layer normalization kernels.
layernorm_extension = _cuda_extension(
    "aphrodite.layernorm_ops",
    ["kernels/layernorm.cpp", "kernels/layernorm_kernels.cu"],
)

# Activation kernels.
activation_extension = _cuda_extension(
    "aphrodite.activation_ops",
    ["kernels/activation.cpp", "kernels/activation_kernels.cu"],
)

# Quantization kernels.
quantization_extension = _cuda_extension(
    "aphrodite.quantization_ops",
    [
        "kernels/quantization.cpp", "kernels/quantization/awq/gemm_kernels.cu",
        "kernels/quantization/gptq/exllama_ext.cpp",
        "kernels/quantization/gptq/cuda_buffers.cu",
        "kernels/quantization/gptq/cuda_func/column_remap.cu",
        "kernels/quantization/gptq/cuda_func/q4_matmul.cu",
        "kernels/quantization/gptq/cuda_func/q4_matrix.cu",
        "kernels/quantization/gptq/alt_matmul_kernel.cu",
        "kernels/quantization/gptq/alt_matmul.cpp"
    ],
)

# Misc. CUDA utils.
cuda_utils_extension = _cuda_extension(
    "aphrodite.cuda_utils",
    ["kernels/cuda_utils.cpp", "kernels/cuda_utils_kernels.cu"],
)

# All extension modules handed to setuptools, in definition order.
ext_modules = [
    cache_extension,
    attention_extension,
    positional_encoding_extension,
    layernorm_extension,
    activation_extension,
    quantization_extension,
    cuda_utils_extension,
]
def get_path(*filepath) -> str:
    """Join the given path components onto ROOT_DIR and return the result."""
    components = (ROOT_DIR, ) + filepath
    return os.path.join(*components)
def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py

    Args:
        filepath: Path of a text file containing a line of the form
            ``__version__ = "<version>"`` (single or double quotes).

    Returns:
        The text between the quotes of the first matching line.

    Raises:
        RuntimeError: If no such line is found.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    with open(filepath, encoding="utf-8") as fp:
        contents = fp.read()
    # re.M lets "^" match at the start of any line, not just the file start.
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents,
                              re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")
def read_readme() -> str:
    """Read the README file.

    Returns:
        The full contents of ``README.md`` (resolved via ``get_path``),
        decoded as UTF-8.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with io.open(get_path("README.md"), "r", encoding="utf-8") as readme:
        return readme.read()
def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt.

    Returns:
        The lines of ``requirements.txt`` (resolved via ``get_path``), after
        stripping leading and trailing whitespace from the whole file.
    """
    requirements_path = get_path("requirements.txt")
    with open(requirements_path) as handle:
        contents = handle.read()
    return contents.strip().split("\n")
# Links shown alongside the package metadata.
_PROJECT_URLS = {
    "Homepage": "https://pygmalion.chat",
    "Documentation": "https://docs.pygmalion.chat",
    "GitHub": "https://github.com/PygmalionAI",
    "Huggingface": "https://huggingface.co/PygmalionAI",
}

# Trove classifiers: supported Python versions, license, and topic.
_CLASSIFIERS = [
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

# Python packages to ship; the listed directories are excluded.
_PACKAGES = setuptools.find_packages(exclude=("kernels", "examples", "tests"))

# Package definition. The version is read from aphrodite/__init__.py and the
# runtime dependencies from requirements.txt; the CUDA extensions are built
# through torch's BuildExtension command.
setuptools.setup(
    name="aphrodite-engine",
    version=find_version(get_path("aphrodite", "__init__.py")),
    author="PygmalionAI",
    license="AGPL 3.0",
    description="The inference engine for PygmalionAI models",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/PygmalionAI/aphrodite-engine",
    project_urls=_PROJECT_URLS,
    classifiers=_CLASSIFIERS,
    packages=_PACKAGES,
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)
|