#include <cudaTypedefs.h>

#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>
// Per-architecture kernel entry points (defined in separate translation
// units).
void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias);

#if defined CUDA_VERSION && CUDA_VERSION >= 12000
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias);
#endif
// Variants that additionally apply an azp (asymmetric zero point) correction.
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
                                torch::Tensor const& b_scales,
                                torch::Tensor const& azp_adj,
                                c10::optional<torch::Tensor> const& azp,
                                c10::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
                                torch::Tensor const& b_scales,
                                torch::Tensor const& azp_adj,
                                c10::optional<torch::Tensor> const& azp,
                                c10::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
                                torch::Tensor const& b_scales,
                                torch::Tensor const& azp_adj,
                                c10::optional<torch::Tensor> const& azp,
                                c10::optional<torch::Tensor> const& bias);

#if defined CUDA_VERSION && CUDA_VERSION >= 12000
void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
                                torch::Tensor const& b_scales,
                                torch::Tensor const& azp_adj,
                                c10::optional<torch::Tensor> const& azp,
                                c10::optional<torch::Tensor> const& bias);
#endif
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
  // CUTLASS FP8 kernels need at least
  //   CUDA 12.0 on SM90 systems (Hopper)
  //   CUDA 12.4 on SM89 systems (Lovelace)

#if defined CUDA_VERSION
  if (cuda_device_capability >= 90) {
    return CUDA_VERSION >= 12000;
  } else if (cuda_device_capability >= 89) {
    return CUDA_VERSION >= 12040;
  }
#endif

  return false;
}
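// For example: an SM90 (H100) build with CUDA 12.1 returns true here, while
// an SM89 (L40 / RTX 4090) build with the same toolkit returns false, since
// the SM89 FP8 kernels require CUDA 12.4. (Illustrative note, not part of
// the original file.)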
// Compute capability encoded as major * 10 + minor (e.g. 90 for SM90/Hopper).
// Note that this queries device ordinal 0, not the current device.
int32_t get_sm_version_num() {
  int32_t major_capability, minor_capability;
  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
                         0);
  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
                         0);
  int32_t version_num = major_capability * 10 + minor_capability;
  return version_num;
}
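// For instance, an A100 (compute capability 8.0) yields 80 and an H100 (9.0)
// yields 90, matching the version_num comparisons in the dispatchers below.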
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                       torch::Tensor const& b, torch::Tensor const& a_scales,
                       torch::Tensor const& b_scales,
                       c10::optional<torch::Tensor> const& bias) {
  // Checks for conformality
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));

  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16-byte alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());

  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }
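  // Layout note (illustrative, not in the original file): a column-major b
  // that satisfies the stride checks above can be built as a transposed view,
  // e.g.
  //   auto b = torch::randint(-128, 128, {n, k}, opts).t();  // (k, n) view
  // which gives b.stride(0) == 1 and b.stride(1) == k.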
  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));

  int32_t version_num = get_sm_version_num();
  if (version_num >= 90) {
    // Hopper

    // Guard against compilation issues for sm90 kernels
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
#else
    // Built without CUDA 12.0+: fall back to the SM80 kernel
    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
#endif
  } else if (version_num == 89) {
    // Ada Lovelace
    cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales, bias);
  } else if (version_num >= 80) {
    // Ampere
    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
  } else {
    // Turing
    TORCH_CHECK(version_num >= 75);
    cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
  }
}
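// Usage sketch (illustrative, not part of the original file): a minimal int8
// GEMM with per-token / per-channel scales. Sizes are multiples of 16 so the
// alignment checks above pass.
//
//   int64_t m = 32, n = 64, k = 128;
//   auto i8 = torch::dtype(torch::kInt8).device(torch::kCUDA);
//   auto f32 = torch::dtype(torch::kFloat32).device(torch::kCUDA);
//   auto a = torch::randint(-128, 128, {m, k}, i8);      // row-major
//   auto b = torch::randint(-128, 128, {n, k}, i8).t();  // column-major view
//   auto a_scales = torch::rand({m, 1}, f32);            // per-token
//   auto b_scales = torch::rand({1, n}, f32);            // per-channel
//   auto c = torch::empty(
//       {m, n}, torch::dtype(torch::kBFloat16).device(torch::kCUDA));
//   cutlass_scaled_mm(c, a, b, a_scales, b_scales, c10::nullopt);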
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
                           torch::Tensor const& b,
                           torch::Tensor const& a_scales,
                           torch::Tensor const& b_scales,
                           torch::Tensor const& azp_adj,
                           c10::optional<torch::Tensor> const& azp,
                           c10::optional<torch::Tensor> const& bias) {
  // Checks for conformality
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));

  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16-byte alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());

  // bias, azp, azp_adj are all 1d
  // bias and azp_adj have n elements, azp has m elements
  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous());
  }
  if (azp) {
    TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous());
  }
  TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous());

  // azp & bias types
  TORCH_CHECK(azp_adj.dtype() == torch::kInt32);
  TORCH_CHECK(!azp || azp->dtype() == torch::kInt32);
  TORCH_CHECK(!bias || bias->dtype() == c.dtype(),
              "currently bias dtype must match output dtype ", c.dtype());
  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));

  int32_t version_num = get_sm_version_num();
  if (version_num >= 90) {
    // Hopper

    // Guard against compilation issues for sm90 kernels
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
    cutlass_scaled_mm_azp_sm90(c, a, b, a_scales, b_scales, azp_adj, azp,
                               bias);
#else
    // Built without CUDA 12.0+: fall back to the SM80 kernel
    cutlass_scaled_mm_azp_sm80(c, a, b, a_scales, b_scales, azp_adj, azp,
                               bias);
#endif
  } else if (version_num == 89) {
    // Ada Lovelace
    cutlass_scaled_mm_azp_sm89(c, a, b, a_scales, b_scales, azp_adj, azp,
                               bias);
  } else if (version_num >= 80) {
    // Ampere
    cutlass_scaled_mm_azp_sm80(c, a, b, a_scales, b_scales, azp_adj, azp,
                               bias);
  } else {
    // Turing
    TORCH_CHECK(version_num >= 75);
    cutlass_scaled_mm_azp_sm75(c, a, b, a_scales, b_scales, azp_adj, azp,
                               bias);
  }
}
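// Background sketch (author's reading, not from the original file): with an
// asymmetric per-token zero point azp, the integer product expands as
//   (a - azp * 1^T) @ b = a @ b - azp * colsum(b),
// so the kernel only needs b's column sums. A caller might precompute
//   auto azp_adj = b.sum({0}, /*keepdim=*/false, torch::kInt32);  // n elems
// and pass the per-token azp (m int32 elements) separately; with a per-tensor
// zero point, azp * colsum(b) can be folded into azp_adj up front and the
// optional azp argument left empty (c10::nullopt).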