- #include <torch/all.h>
- #include <ATen/cuda/CUDAContext.h>
- #include <c10/cuda/CUDAGuard.h>
- #include "causal_conv1d.h"
- #include <c10/util/BFloat16.h>
- #include <c10/util/Half.h>
- #include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
- #include <cub/block/block_load.cuh>
- #include <cub/block/block_store.cuh>
- #include "static_switch.h"
- #define CHECK_SHAPE(x, ...) \
- TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
- #x " must have shape (" #__VA_ARGS__ ")")
- #define DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \
- if (ITYPE == at::ScalarType::Half) { \
- using input_t = at::Half; \
- __VA_ARGS__(); \
- } else if (ITYPE == at::ScalarType::BFloat16) { \
- using input_t = at::BFloat16; \
- __VA_ARGS__(); \
- } else if (ITYPE == at::ScalarType::Float) { \
- using input_t = float; \
- __VA_ARGS__(); \
- } else { \
- AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), \
- "'"); \
- }
- #define DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(WTYPE, NAME, ...) \
- if (WTYPE == at::ScalarType::Half) { \
- using weight_t = at::Half; \
- __VA_ARGS__(); \
- } else if (WTYPE == at::ScalarType::BFloat16) { \
- using weight_t = at::BFloat16; \
- __VA_ARGS__(); \
- } else if (WTYPE == at::ScalarType::Float) { \
- using weight_t = float; \
- __VA_ARGS__(); \
- } else { \
- AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), \
- "'"); \
- }
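- // Editor's note (illustrative, not part of the original source): the two
- // macros above expand to if/else chains over the three supported dtypes and
- // bind `input_t` / `weight_t` inside the lambda, e.g.
- //   DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "fn", [&] {
- //     DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "fn", [&] {
- //       some_launcher<input_t, weight_t>(params, stream);  // hypothetical name
- //     });
- //   });
- // This is exactly how they are used in causal_conv1d_fwd and
- // causal_conv1d_update below.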
- template <typename input_t, typename weight_t>
- void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream);
- template <typename input_t, typename weight_t>
- void causal_conv1d_channellast_fwd_cuda(ConvParamsBase& params,
- cudaStream_t stream);
- template <typename input_t, typename weight_t>
- void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream);
- void set_conv_params_fwd(ConvParamsBase& params,
- // sizes
- const size_t batch, const size_t dim,
- const size_t seqlen, const size_t width,
- // device pointers
- const at::Tensor x, const at::Tensor weight,
- const at::Tensor out, void* bias_ptr,
- bool silu_activation) {
- // Reset the parameters
- memset(&params, 0, sizeof(params));
- params.batch = batch;
- params.dim = dim;
- params.seqlen = seqlen;
- params.width = width;
- params.silu_activation = silu_activation;
- // Set the pointers and strides.
- params.x_ptr = x.data_ptr();
- params.weight_ptr = weight.data_ptr();
- params.bias_ptr = bias_ptr;
- params.out_ptr = out.data_ptr();
- // All strides are in elements, not bytes.
- params.x_batch_stride = x.stride(0);
- params.x_c_stride = x.stride(1);
- params.x_l_stride = x.stride(-1);
- params.weight_c_stride = weight.stride(0);
- params.weight_width_stride = weight.stride(1);
- params.out_batch_stride = out.stride(0);
- params.out_c_stride = out.stride(1);
- params.out_l_stride = out.stride(-1);
- }
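- // Editor's note (illustrative): for a contiguous x of shape (batch, dim,
- // seqlen), x.stride() is {dim * seqlen, seqlen, 1}, so x_l_stride == 1 and
- // the row-major kernel is used. A channel-last x, e.g. one built as
- //   auto x = x_blc.transpose(1, 2);  // x_blc contiguous (batch, seqlen, dim)
- // has strides {seqlen * dim, 1, dim}, which is what the is_channel_last
- // check in causal_conv1d_fwd below detects.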
- at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
- const c10::optional<at::Tensor>& bias_,
- const c10::optional<at::Tensor>& seq_idx_,
- const c10::optional<at::Tensor>& seq_pos_idx_,
- const c10::optional<at::Tensor>& initial_states_,
- const c10::optional<at::Tensor>& final_states_out_,
- bool silu_activation) {
- auto input_type = x.scalar_type();
- auto weight_type = weight.scalar_type();
- TORCH_CHECK(input_type == at::ScalarType::Float ||
- input_type == at::ScalarType::Half ||
- input_type == at::ScalarType::BFloat16);
- TORCH_CHECK(weight_type == at::ScalarType::Float ||
- weight_type == at::ScalarType::Half ||
- weight_type == at::ScalarType::BFloat16);
- TORCH_CHECK(x.is_cuda());
- TORCH_CHECK(weight.is_cuda());
- const auto sizes = x.sizes();
- const int batch_size = sizes[0];
- const int dim = sizes[1];
- const int seqlen = sizes[2];
- const int width = weight.size(-1);
- CHECK_SHAPE(x, batch_size, dim, seqlen);
- CHECK_SHAPE(weight, dim, width);
- TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1);
- const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1;
- if (is_channel_last) {
- TORCH_CHECK(
- dim % 8 == 0,
- "causal_conv1d only supports channel dimension divisible by 8 for now");
- TORCH_CHECK(x.stride(2) % 8 == 0 && x.stride(0) % 8 == 0,
- "causal_conv1d with channel last layout requires strides "
- "(x.stride(0) and x.stride(2)) to be multiples of 8");
- }
- TORCH_CHECK(width >= 2 && width <= 4,
- "causal_conv1d only supports width between 2 and 4");
- if (bias_.has_value()) {
- auto bias = bias_.value();
- TORCH_CHECK(bias.scalar_type() == weight_type);
- TORCH_CHECK(bias.is_cuda());
- TORCH_CHECK(bias.stride(-1) == 1);
- CHECK_SHAPE(bias, dim);
- }
- if (seq_idx_.has_value()) {
- TORCH_CHECK(is_channel_last,
- "seq_idx is only supported for channel last layout");
- auto seq_idx = seq_idx_.value();
- TORCH_CHECK(seq_idx.scalar_type() == torch::kInt32);
- TORCH_CHECK(seq_idx.is_cuda());
- TORCH_CHECK(seq_idx.is_contiguous());
- CHECK_SHAPE(seq_idx, batch_size, seqlen);
- }
- if (seq_pos_idx_.has_value()) {
- auto seq_pos_idx = seq_pos_idx_.value();
- TORCH_CHECK(seq_pos_idx.scalar_type() == torch::kInt32);
- TORCH_CHECK(seq_pos_idx.is_cuda());
- TORCH_CHECK(seq_pos_idx.is_contiguous());
- CHECK_SHAPE(seq_pos_idx, batch_size, seqlen);
- }
- at::Tensor out = torch::empty_like(x);
- ConvParamsBase params;
- set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
- bias_.has_value() ? bias_.value().data_ptr() : nullptr,
- silu_activation);
- if (seq_idx_.has_value()) {
- params.seq_idx_ptr = seq_idx_.value().data_ptr();
- } else {
- params.seq_idx_ptr = nullptr;
- }
- if (seq_pos_idx_.has_value()) {
- params.seq_pos_idx_ptr = seq_pos_idx_.value().data_ptr();
- } else {
- params.seq_pos_idx_ptr = nullptr;
- }
- if (initial_states_.has_value()) {
- TORCH_CHECK(is_channel_last,
- "initial_states is only supported for channel last layout");
- auto initial_states = initial_states_.value();
- TORCH_CHECK(initial_states.scalar_type() == input_type);
- TORCH_CHECK(initial_states.is_cuda());
- CHECK_SHAPE(initial_states, batch_size, dim, width - 1);
- TORCH_CHECK(initial_states.stride(1) == 1);
- params.initial_states_ptr = initial_states.data_ptr();
- params.initial_states_batch_stride = initial_states.stride(0);
- params.initial_states_c_stride = initial_states.stride(1);
- params.initial_states_l_stride = initial_states.stride(2);
- } else {
- params.initial_states_ptr = nullptr;
- }
- if (final_states_out_.has_value()) {
- TORCH_CHECK(is_channel_last,
- "final_states is only supported for channel last layout");
- auto final_states = final_states_out_.value();
- TORCH_CHECK(final_states.scalar_type() == input_type);
- TORCH_CHECK(final_states.is_cuda());
- CHECK_SHAPE(final_states, batch_size, dim, width - 1);
- TORCH_CHECK(final_states.stride(1) == 1);
- params.final_states_ptr = final_states.data_ptr();
- params.final_states_batch_stride = final_states.stride(0);
- params.final_states_c_stride = final_states.stride(1);
- params.final_states_l_stride = final_states.stride(2);
- } else {
- params.final_states_ptr = nullptr;
- }
- // Otherwise the kernel will be launched from the cuda:0 device
- // Cast to char to avoid compiler warning about narrowing
- at::cuda::CUDAGuard device_guard{(char)x.get_device()};
- auto stream = at::cuda::getCurrentCUDAStream().stream();
- DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(
- x.scalar_type(), "causal_conv1d_fwd", [&] {
- DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(
- weight.scalar_type(), "causal_conv1d_fwd", [&] {
- if (!is_channel_last) {
- causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
- } else {
- causal_conv1d_channellast_fwd_cuda<input_t, weight_t>(params,
- stream);
- }
- });
- });
- return out;
- }
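- // Editor's note: usage sketch (illustrative only, assuming this function is
- // called directly from C++; tensor names are made up):
- //   auto opts = torch::dtype(torch::kFloat16).device(torch::kCUDA);
- //   at::Tensor x = torch::randn({8, 256, 1024}, opts);  // (batch, dim, seqlen)
- //   at::Tensor w = torch::randn({256, 4}, opts);        // (dim, width)
- //   at::Tensor b = torch::randn({256}, opts);           // (dim)
- //   at::Tensor y = causal_conv1d_fwd(x, w, b, c10::nullopt, c10::nullopt,
- //                                    c10::nullopt, c10::nullopt,
- //                                    /*silu_activation=*/true);
- // y has the same shape, dtype and layout as x; each channel is convolved
- // causally with its own width-4 filter plus bias, followed by SiLU.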
- at::Tensor causal_conv1d_update(const at::Tensor& x,
- const at::Tensor& conv_state,
- const at::Tensor& weight,
- const c10::optional<at::Tensor>& bias_,
- bool silu_activation) {
- auto input_type = x.scalar_type();
- auto weight_type = weight.scalar_type();
- TORCH_CHECK(input_type == at::ScalarType::Float ||
- input_type == at::ScalarType::Half ||
- input_type == at::ScalarType::BFloat16);
- TORCH_CHECK(weight_type == at::ScalarType::Float ||
- weight_type == at::ScalarType::Half ||
- weight_type == at::ScalarType::BFloat16);
- TORCH_CHECK(conv_state.scalar_type() == input_type);
- TORCH_CHECK(x.is_cuda());
- TORCH_CHECK(conv_state.is_cuda());
- TORCH_CHECK(weight.is_cuda());
- const auto sizes = x.sizes();
- const int batch_size = sizes[0];
- const int dim = sizes[1];
- const int width = weight.size(-1);
- CHECK_SHAPE(x, batch_size, dim);
- CHECK_SHAPE(conv_state, batch_size, dim, width);
- CHECK_SHAPE(weight, dim, width);
- TORCH_CHECK(width >= 2 && width <= 4,
- "causal_conv1d only supports width between 2 and 4");
- if (bias_.has_value()) {
- auto bias = bias_.value();
- TORCH_CHECK(bias.scalar_type() == weight_type);
- TORCH_CHECK(bias.is_cuda());
- TORCH_CHECK(bias.stride(-1) == 1);
- CHECK_SHAPE(bias, dim);
- }
- at::Tensor out = torch::empty_like(x);
- ConvParamsBase params;
- set_conv_params_fwd(
- params, batch_size, dim, /*seqlen=*/1, width, x, weight, out,
- bias_.has_value() ? bias_.value().data_ptr() : nullptr, silu_activation);
- params.conv_state_ptr = conv_state.data_ptr();
- // All strides are in elements, not bytes.
- params.conv_state_batch_stride = conv_state.stride(0);
- params.conv_state_c_stride = conv_state.stride(1);
- params.conv_state_l_stride = conv_state.stride(2);
- // Otherwise the kernel will be launched from the cuda:0 device
- // Cast to char to avoid compiler warning about narrowing
- at::cuda::CUDAGuard device_guard{(char)x.get_device()};
- auto stream = at::cuda::getCurrentCUDAStream().stream();
- DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(
- x.scalar_type(), "causal_conv1d_update", [&] {
- DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(
- weight.scalar_type(), "causal_conv1d_update", [&] {
- causal_conv1d_update_cuda<input_t, weight_t>(params, stream);
- });
- });
- return out;
- }
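- // Editor's note: causal_conv1d_update performs one decoding step. For each
- // (batch, channel), conv_state holds the last `width` inputs; the kernel
- // shifts it left by one, appends the new x, and computes
- //   out = bias + sum_{w=0..width-1} weight[w] * state[w]   (+ optional SiLU).
- // Scalar reference of the same recurrence (illustrative only):
- //   for (int w = 0; w < width - 1; ++w) state[w] = state[w + 1];
- //   state[width - 1] = x_new;
- //   float out = bias;
- //   for (int w = 0; w < width; ++w) out += weight[w] * state[w];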
- template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_,
- typename weight_t_>
- struct Causal_conv1d_fwd_kernel_traits {
- using input_t = input_t_;
- using weight_t = weight_t_;
- static constexpr int kNThreads = kNThreads_;
- static constexpr int kWidth = kWidth_;
- static constexpr int kNBytes = sizeof(input_t);
- static_assert(kNBytes == 2 || kNBytes == 4);
- static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
- static_assert(kWidth <= kNElts);
- static constexpr bool kIsVecLoad = kIsVecLoad_;
- static constexpr int kNLoadsIndex = kNElts / 4;
- using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
- using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNElts,
- cub::BLOCK_LOAD_WARP_TRANSPOSE>;
- using BlockLoadVecT =
- cub::BlockLoad<vec_t, kNThreads, 1, cub::BLOCK_LOAD_DIRECT>;
- using BlockLoadIndexT =
- cub::BlockLoad<int, kNThreads, kNElts, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
- using BlockLoadIndexVecT = cub::BlockLoad<int4, kNThreads, kNLoadsIndex,
- !(kIsVecLoad && kNLoadsIndex == 1)
- ? cub::BLOCK_LOAD_WARP_TRANSPOSE
- : cub::BLOCK_LOAD_DIRECT>;
- using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNElts,
- cub::BLOCK_STORE_WARP_TRANSPOSE>;
- using BlockStoreVecT =
- cub::BlockStore<vec_t, kNThreads, 1, cub::BLOCK_STORE_DIRECT>;
- static constexpr int kSmemIOSize =
- (kIsVecLoad && kNLoadsIndex == 1)
- ? 0
- : std::max({sizeof(typename BlockLoadT::TempStorage),
- sizeof(typename BlockStoreT::TempStorage),
- sizeof(typename BlockLoadIndexT::TempStorage),
- sizeof(typename BlockLoadIndexVecT::TempStorage)});
- static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts;
- static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize;
- };
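- // Editor's note, worked example (fp16, kNThreads = 128, kWidth = 4): kNBytes
- // = 2 and kNElts = 8, so each thread handles 8 sequence elements per chunk
- // and vec_t is a 16-byte type; kNLoadsIndex = 2; the chunk size used in the
- // kernel below is kNThreads * kNElts = 1024 elements; kSmemExchangeSize =
- // 128 * 2 * 8 = 2048 bytes on top of the cub load/store staging area.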
- template <typename Ktraits, bool kHasSeqPosIdx>
- __global__ __launch_bounds__(Ktraits::kNThreads) void causal_conv1d_fwd_kernel(
- ConvParamsBase params) {
- constexpr int kWidth = Ktraits::kWidth;
- constexpr int kNThreads = Ktraits::kNThreads;
- constexpr int kNElts = Ktraits::kNElts;
- static constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
- using input_t = typename Ktraits::input_t;
- using vec_t = typename Ktraits::vec_t;
- using weight_t = typename Ktraits::weight_t;
- // Shared memory.
- extern __shared__ char smem_[];
- [[maybe_unused]] auto& smem_load =
- reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
- [[maybe_unused]] auto& smem_load_vec =
- reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_);
- [[maybe_unused]] auto& smem_load_index =
- reinterpret_cast<typename Ktraits::BlockLoadIndexT::TempStorage&>(smem_);
- [[maybe_unused]] auto& smem_load_index_vec =
- reinterpret_cast<typename Ktraits::BlockLoadIndexVecT::TempStorage&>(
- smem_);
- [[maybe_unused]] auto& smem_store =
- reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
- [[maybe_unused]] auto& smem_store_vec =
- reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
- vec_t* smem_exchange = reinterpret_cast<vec_t*>(smem_ + Ktraits::kSmemIOSize);
- const int tidx = threadIdx.x;
- const int batch_id = blockIdx.x;
- const int channel_id = blockIdx.y;
- input_t* x = reinterpret_cast<input_t*>(params.x_ptr) +
- batch_id * params.x_batch_stride +
- channel_id * params.x_c_stride;
- weight_t* weight = reinterpret_cast<weight_t*>(params.weight_ptr) +
- channel_id * params.weight_c_stride;
- input_t* out = reinterpret_cast<input_t*>(params.out_ptr) +
- batch_id * params.out_batch_stride +
- channel_id * params.out_c_stride;
- float bias_val =
- params.bias_ptr == nullptr
- ? 0.f
- : float(reinterpret_cast<weight_t*>(params.bias_ptr)[channel_id]);
- int* seq_pos_idx = !kHasSeqPosIdx
- ? nullptr
- : reinterpret_cast<int*>(params.seq_pos_idx_ptr) +
- batch_id * params.seqlen;
- // Thread 0 will load the last elements of the previous chunk, so we
- // initialize those to 0.
- if (tidx == 0) {
- input_t zeros[kNElts] = {0};
- smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t*>(zeros)[0];
- }
- float weight_vals[kWidth];
- #pragma unroll
- for (int i = 0; i < kWidth; ++i) {
- weight_vals[i] = float(weight[i * params.weight_width_stride]);
- }
- constexpr int kChunkSize = kNThreads * kNElts;
- const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize;
- for (int chunk = 0; chunk < n_chunks; ++chunk) {
- input_t x_vals_load[2 * kNElts] = {0};
- int seq_pos_idx_load[kNElts];
- if constexpr (kIsVecLoad) {
- Ktraits::BlockLoadVecT(smem_load_vec)
- .Load(reinterpret_cast<vec_t*>(x),
- *reinterpret_cast<vec_t(*)[1]>(&x_vals_load[kNElts]),
- (params.seqlen - chunk * kChunkSize) / kNElts);
- if (kHasSeqPosIdx)
- Ktraits::BlockLoadIndexVecT(smem_load_index_vec)
- .Load(reinterpret_cast<int4*>(seq_pos_idx),
- *reinterpret_cast<int4(*)[Ktraits::kNLoadsIndex]>(
- seq_pos_idx_load),
- (params.seqlen - chunk * kChunkSize) / kNElts *
- Ktraits::kNLoadsIndex);
- } else {
- __syncthreads();
- Ktraits::BlockLoadT(smem_load).Load(
- x, *reinterpret_cast<input_t(*)[kNElts]>(&x_vals_load[kNElts]),
- params.seqlen - chunk * kChunkSize);
- if (kHasSeqPosIdx)
- Ktraits::BlockLoadIndexT(smem_load_index)
- .Load(seq_pos_idx, seq_pos_idx_load,
- (params.seqlen - chunk * kChunkSize), 0);
- }
- x += kChunkSize;
- if (kHasSeqPosIdx) seq_pos_idx += kChunkSize;
- __syncthreads();
- // Thread kNThreads - 1 doesn't write yet, so that thread 0 can read
- // the last elements of the previous chunk.
- if (tidx < kNThreads - 1) {
- smem_exchange[tidx] = reinterpret_cast<vec_t*>(x_vals_load)[1];
- }
- __syncthreads();
- reinterpret_cast<vec_t*>(x_vals_load)[0] =
- smem_exchange[tidx > 0 ? tidx - 1 : kNThreads - 1];
- __syncthreads();
- // Now thread kNThreads - 1 can write the last elements of the current
- // chunk.
- if (tidx == kNThreads - 1) {
- smem_exchange[tidx] = reinterpret_cast<vec_t*>(x_vals_load)[1];
- }
- float x_vals[2 * kNElts];
- #pragma unroll
- for (int i = 0; i < 2 * kNElts; ++i) {
- x_vals[i] = float(x_vals_load[i]);
- }
- float out_vals[kNElts];
- #pragma unroll
- for (int i = 0; i < kNElts; ++i) {
- out_vals[i] = bias_val;
- int w = 0;
- if (kHasSeqPosIdx) {
- if (seq_pos_idx_load[i] < kWidth) {
- w = kWidth - seq_pos_idx_load[i] - 1;
- }
- }
- for (; w < kWidth; ++w) {
- out_vals[i] += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)];
- }
- }
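- // Editor's note, index check (illustrative): with kWidth = 4 and w = 0..3,
- // out_vals[i] reads x_vals[kNElts + i - 3 .. kNElts + i], i.e. the current
- // element plus the three preceding ones; the previous chunk's tail sits in
- // x_vals[0 .. kNElts - 1], which is why x_vals_load holds 2 * kNElts values.
- // When seq_pos_idx marks a position p < kWidth, the loop starts at
- // w = kWidth - p - 1, skipping taps that would reach before the start of
- // that sequence.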
- if (params.silu_activation) {
- #pragma unroll
- for (int i = 0; i < kNElts; ++i) {
- out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i]));
- }
- }
- input_t out_vals_store[kNElts];
- #pragma unroll
- for (int i = 0; i < kNElts; ++i) {
- out_vals_store[i] = out_vals[i];
- }
- if constexpr (kIsVecLoad) {
- Ktraits::BlockStoreVecT(smem_store_vec)
- .Store(reinterpret_cast<vec_t*>(out),
- reinterpret_cast<vec_t(&)[1]>(out_vals_store),
- (params.seqlen - chunk * kChunkSize) / kNElts);
- } else {
- Ktraits::BlockStoreT(smem_store)
- .Store(out, out_vals_store, params.seqlen - chunk * kChunkSize);
- }
- out += kChunkSize;
- }
- }
- template <int kNThreads, int kWidth, typename input_t, typename weight_t>
- void causal_conv1d_fwd_launch(ConvParamsBase& params, cudaStream_t stream) {
- static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
- BOOL_SWITCH(params.seq_pos_idx_ptr != nullptr, kHasSeqPosIdx, [&] {
- BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] {
- using Ktraits =
- Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad,
- input_t, weight_t>;
- constexpr int kSmemSize = Ktraits::kSmemSize;
- dim3 grid(params.batch, params.dim);
- auto kernel = &causal_conv1d_fwd_kernel<Ktraits, kHasSeqPosIdx>;
- if (kSmemSize >= 48 * 1024) {
- C10_CUDA_CHECK(cudaFuncSetAttribute(
- kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
- }
- kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
- C10_CUDA_KERNEL_LAUNCH_CHECK();
- });
- });
- }
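- // Editor's note: the grid above is (batch, dim), i.e. one block of 128
- // threads per (batch, channel) pair, sweeping the sequence in chunks of
- // kNThreads * kNElts elements. The vectorized-load path (kIsVecLoad) is only
- // taken when seqlen is a multiple of kNElts, so it never has to handle a
- // partial vector at the tail of the sequence.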
- template <typename input_t, typename weight_t>
- void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream) {
- if (params.width == 2) {
- causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream);
- } else if (params.width == 3) {
- causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream);
- } else if (params.width == 4) {
- causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream);
- }
- }
- template <int kNThreads_, int kWidth_, int kChunkSizeL_, bool kIsVecLoad_,
- typename input_t_, typename weight_t_>
- struct Causal_conv1d_channellast_fwd_kernel_traits {
- // The cache line is 128 bytes, and we try to read 16 bytes per thread.
- // So we have 8 threads per "row", so 32 or 64 elements in the channel
- // dimension. That leaves 4 columns per warp, and so 16 columns per block
- // (assuming each block has 128 threads). Each load is 16 x 32|64
- // elements in the L x C dimensions.
- using input_t = input_t_;
- using weight_t = weight_t_;
- static constexpr int kNThreads = kNThreads_;
- static_assert(kNThreads % 32 == 0);
- static constexpr int kNWarps = kNThreads / 32;
- static constexpr int kWidth = kWidth_;
- static constexpr int kChunkSizeL = kChunkSizeL_;
- static constexpr int kNBytes = sizeof(input_t);
- static_assert(kNBytes == 2 || kNBytes == 4);
- static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
- static constexpr int kNEltsPerRow = 128 / kNBytes;
- static constexpr int kNThreadsPerRow =
- kNEltsPerRow / kNElts; // Always 8 for now
- static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
- static constexpr int kNColsPerWarp =
- 32 / kNThreadsPerRow; // Always 4 for now
- static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
- static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
- static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
- static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
- static constexpr bool kIsVecLoad = kIsVecLoad_;
- using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
- // using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems,
- //                                   cub::BLOCK_LOAD_WARP_TRANSPOSE>;
- // using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems,
- //                                     cub::BLOCK_STORE_WARP_TRANSPOSE>;
- // static constexpr int kSmemSize =
- //     std::max({sizeof(typename BlockLoadT::TempStorage),
- //               sizeof(typename BlockStoreT::TempStorage)});
- // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
- };
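- // Editor's note, worked example (fp16, kNThreads = 128, kChunkSizeL = 64):
- // kNElts = 8, kNEltsPerRow = 64, kNThreadsPerRow = 8, kNWarps = 4,
- // kNColsPerWarp = 4, kNColsPerLoad = 16 and kNLoads = 64 / 16 = 4, so each
- // block stages a 64 (L) x 64 (C) tile in shared memory using four 16-byte
- // loads per thread.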
- template <typename Ktraits, bool kHasSeqIdx>
- __global__
- __launch_bounds__(Ktraits::kNThreads) void causal_conv1d_channellast_fwd_kernel(
- ConvParamsBase params) {
- constexpr int kWidth = Ktraits::kWidth;
- constexpr int kNThreads = Ktraits::kNThreads;
- constexpr int kNElts = Ktraits::kNElts;
- constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
- constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
- constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
- constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
- using input_t = typename Ktraits::input_t;
- using vec_t = typename Ktraits::vec_t;
- using weight_t = typename Ktraits::weight_t;
- // Shared memory.
- __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts];
- const int batch_id = blockIdx.x;
- const int chunk_l_id = blockIdx.y;
- const int chunk_c_id = blockIdx.z;
- const int tid = threadIdx.x;
- const int l_idx = tid / kNThreadsPerC;
- const int c_idx = tid % kNThreadsPerC;
- input_t* x = reinterpret_cast<input_t*>(params.x_ptr) +
- batch_id * params.x_batch_stride +
- (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride +
- chunk_c_id * kChunkSizeC + c_idx * kNElts;
- weight_t* weight = reinterpret_cast<weight_t*>(params.weight_ptr) +
- chunk_c_id * kChunkSizeC * params.weight_c_stride;
- input_t* out = reinterpret_cast<input_t*>(params.out_ptr) +
- batch_id * params.out_batch_stride +
- (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride +
- chunk_c_id * kChunkSizeC + c_idx * kNElts;
- [[maybe_unused]] int* seq_idx =
- !kHasSeqIdx ? nullptr
- : reinterpret_cast<int*>(params.seq_idx_ptr) +
- batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
- input_t* initial_states =
- params.initial_states_ptr == nullptr || chunk_l_id > 0
- ? nullptr
- : reinterpret_cast<input_t*>(params.initial_states_ptr) +
- batch_id * params.initial_states_batch_stride +
- l_idx * params.initial_states_l_stride +
- chunk_c_id * kChunkSizeC + c_idx * kNElts;
- // The last L-chunk will also have enough info to write to final states,
- // since it also contains a few x values from the previous L-chunk.
- input_t* final_states =
- params.final_states_ptr == nullptr || chunk_l_id < gridDim.y - 1
- ? nullptr
- : reinterpret_cast<input_t*>(params.final_states_ptr) +
- batch_id * params.final_states_batch_stride +
- l_idx * params.final_states_l_stride +
- chunk_c_id * kChunkSizeC + c_idx * kNElts;
- #pragma unroll
- for (int l = 0; l < Ktraits::kNLoads; ++l) {
- input_t x_vals_load[kNElts] = {0};
- if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen &&
- chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
- reinterpret_cast<vec_t*>(x_vals_load)[0] =
- *reinterpret_cast<vec_t*>(x + l * kLPerLoad * params.x_l_stride);
- }
- reinterpret_cast<vec_t*>(
- x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] =
- reinterpret_cast<vec_t*>(x_vals_load)[0];
- }
- // Load the elements from the previous chunk that are needed for convolution.
- if (l_idx < kWidth - 1) {
- input_t x_vals_load[kNElts] = {0};
- if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0 &&
- chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen &&
- chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
- reinterpret_cast<vec_t*>(x_vals_load)[0] =
- *reinterpret_cast<vec_t*>(x - (kWidth - 1) * params.x_l_stride);
- } else if (initial_states != nullptr &&
- chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0 &&
- chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
- reinterpret_cast<vec_t*>(x_vals_load)[0] =
- *reinterpret_cast<vec_t*>(initial_states);
- }
- reinterpret_cast<vec_t*>(x_smem[l_idx])[c_idx] =
- reinterpret_cast<vec_t*>(x_vals_load)[0];
- }
- __syncthreads();
- if (final_states != nullptr && l_idx < kWidth - 1 &&
- chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
- // x_smem[0] contains the element at index chunk_l_id * kChunkSizeL -
- // (kWidth - 1), so the last few elements (index params.seqlen - kWidth + 1 +
- // l_idx) are stored in x_smem[params.seqlen - kWidth + 1 + l_idx -
- // (chunk_l_id * kChunkSizeL - kWidth + 1)][c_idx].
- *reinterpret_cast<vec_t*>(final_states) = reinterpret_cast<vec_t*>(
- x_smem[params.seqlen + l_idx - chunk_l_id * kChunkSizeL])[c_idx];
- }
- constexpr int kLPerThread =
- std::min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
- static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
- constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
- static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
- // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for
- // simplicity
- static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
- static_assert((kLPerThread & (kLPerThread - 1)) == 0);
- static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
- static_assert(kNThreadsPerRow <= 32);
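- // Editor's note, worked example (fp16, kNThreads = 128, kChunkSizeL =
- // kChunkSizeC = 64): kLPerThread = min(64 * 64 / 128, 64) = 32 and
- // kNThreadsPerRow = 64 / 32 = 2, i.e. in this compute phase each thread owns
- // 32 consecutive sequence positions of one channel and two threads cover a
- // full channel row of the tile.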
- const int row_idx = tid / kNThreadsPerRow;
- const int col_idx = tid % kNThreadsPerRow;
- float bias_val =
- params.bias_ptr == nullptr ||
- chunk_c_id * kChunkSizeC + row_idx >= params.dim
- ? 0.f
- : float(reinterpret_cast<weight_t*>(
- params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
- float weight_vals[kWidth] = {0};
- if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
- #pragma unroll
- for (int w = 0; w < kWidth; ++w) {
- weight_vals[w] = weight[row_idx * params.weight_c_stride +
- w * params.weight_width_stride];
- }
- }
- float x_vals[kWidth - 1 + kLPerThread];
- #pragma unroll
- for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
- x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
- }
- int seq_idx_thread[kWidth - 1 + kLPerThread];
- if constexpr (kHasSeqIdx) {
- #pragma unroll
- for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
- seq_idx_thread[i] =
- chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1) >=
- 0
- ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)]
- : -1;
- }
- }
- float out_vals[kLPerThread];
- #pragma unroll
- for (int i = 0; i < kLPerThread; ++i) {
- out_vals[i] = bias_val;
- [[maybe_unused]] const int seq_idx_cur =
- !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
- #pragma unroll
- for (int w = 0; w < kWidth; ++w) {
- if constexpr (!kHasSeqIdx) {
- out_vals[i] += weight_vals[w] * x_vals[i + w];
- } else {
- out_vals[i] += seq_idx_thread[i + w] == seq_idx_cur
- ? weight_vals[w] * x_vals[i + w]
- : 0.f;
- }
- }
- if (params.silu_activation) {
- out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i]));
- }
- }
- __syncthreads();
- #pragma unroll
- for (int i = 0; i < kLPerThread; ++i) {
- x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i];
- }
- __syncthreads();
- #pragma unroll
- for (int l = 0; l < Ktraits::kNLoads; ++l) {
- input_t out_vals_store[kNElts];
- reinterpret_cast<vec_t*>(out_vals_store)[0] =
- reinterpret_cast<vec_t*>(x_smem[l * kLPerLoad + l_idx])[c_idx];
- if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen &&
- chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
- *reinterpret_cast<vec_t*>(out + l * kLPerLoad * params.out_l_stride) =
- reinterpret_cast<vec_t*>(out_vals_store)[0];
- }
- }
- }
- template <int kNThreads, int kWidth, typename input_t, typename weight_t>
- void causal_conv1d_channellast_fwd_launch(ConvParamsBase& params,
- cudaStream_t stream) {
- BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
- using Ktraits =
- Causal_conv1d_channellast_fwd_kernel_traits<kNThreads, kWidth, 64, true,
- input_t, weight_t>;
- // constexpr int kSmemSize = Ktraits::kSmemSize;
- constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
- constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
- const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
- const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
- dim3 grid(params.batch, n_chunks_L, n_chunks_C);
- dim3 block(Ktraits::kNThreads);
- auto kernel = &causal_conv1d_channellast_fwd_kernel<Ktraits, kHasSeqIdx>;
- // if (kSmemSize >= 48 * 1024) {
- // C10_CUDA_CHECK(cudaFuncSetAttribute(
- // kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
- // }
- // kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
- kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
- C10_CUDA_KERNEL_LAUNCH_CHECK();
- });
- }
- template <typename input_t, typename weight_t>
- void causal_conv1d_channellast_fwd_cuda(ConvParamsBase& params,
- cudaStream_t stream) {
- if (params.width == 2) {
- causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params,
- stream);
- } else if (params.width == 3) {
- causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params,
- stream);
- } else if (params.width == 4) {
- causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params,
- stream);
- }
- }
- template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<at::Half, float>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<at::BFloat16, float>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<float, at::Half>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<at::BFloat16, at::Half>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<float, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<at::Half, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<float, float>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<at::Half, float>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, float>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<float, at::Half>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<at::Half, at::Half>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::Half>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<float, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<at::Half, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- ///////
- template <int kNThreads_, int kWidth_, typename input_t_, typename weight_t_>
- struct Causal_conv1d_update_kernel_traits {
- using input_t = input_t_;
- using weight_t = weight_t_;
- static constexpr int kNThreads = kNThreads_;
- static constexpr int kWidth = kWidth_;
- static constexpr int kNBytes = sizeof(input_t);
- static_assert(kNBytes == 2 || kNBytes == 4);
- };
- template <typename Ktraits>
- __global__
- __launch_bounds__(Ktraits::kNThreads) void causal_conv1d_update_kernel(
- ConvParamsBase params) {
- constexpr int kWidth = Ktraits::kWidth;
- constexpr int kNThreads = Ktraits::kNThreads;
- using input_t = typename Ktraits::input_t;
- using weight_t = typename Ktraits::weight_t;
- const int tidx = threadIdx.x;
- const int batch_id = blockIdx.x;
- const int channel_id = blockIdx.y * kNThreads + tidx;
- input_t* x = reinterpret_cast<input_t*>(params.x_ptr) +
- batch_id * params.x_batch_stride +
- channel_id * params.x_c_stride;
- input_t* conv_state = reinterpret_cast<input_t*>(params.conv_state_ptr) +
- batch_id * params.conv_state_batch_stride +
- channel_id * params.conv_state_c_stride;
- weight_t* weight = reinterpret_cast<weight_t*>(params.weight_ptr) +
- channel_id * params.weight_c_stride;
- input_t* out = reinterpret_cast<input_t*>(params.out_ptr) +
- batch_id * params.out_batch_stride +
- channel_id * params.out_c_stride;
- float bias_val =
- params.bias_ptr == nullptr || channel_id >= params.dim
- ? 0.f
- : float(reinterpret_cast<weight_t*>(params.bias_ptr)[channel_id]);
- float weight_vals[kWidth] = {0};
- if (channel_id < params.dim) {
- #pragma unroll
- for (int i = 0; i < kWidth; ++i) {
- weight_vals[i] = float(weight[i * params.weight_width_stride]);
- }
- }
- float x_vals[kWidth] = {0};
- if (channel_id < params.dim) {
- #pragma unroll
- for (int i = 0; i < kWidth - 1; ++i) {
- x_vals[i] = float(conv_state[(i + 1) * params.conv_state_l_stride]);
- }
- x_vals[kWidth - 1] = float(x[0]);
- #pragma unroll
- for (int i = 0; i < kWidth; ++i) {
- conv_state[i * params.conv_state_l_stride] = input_t(x_vals[i]);
- }
- }
- float out_val = bias_val;
- #pragma unroll
- for (int i = 0; i < kWidth; ++i) {
- out_val += weight_vals[i] * x_vals[i];
- }
- if (params.silu_activation) {
- out_val = out_val / (1 + expf(-out_val));
- }
- if (channel_id < params.dim) {
- out[0] = input_t(out_val);
- }
- }
- template <int kNThreads, int kWidth, typename input_t, typename weight_t>
- void causal_conv1d_update_launch(ConvParamsBase& params, cudaStream_t stream) {
- using Ktraits =
- Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
- dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
- auto kernel = &causal_conv1d_update_kernel<Ktraits>;
- kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
- C10_CUDA_KERNEL_LAUNCH_CHECK();
- }
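- // Editor's note: here the grid is (batch, ceil(dim / kNThreads)) and each
- // thread handles exactly one channel for the single new timestep, so no
- // shared memory or cross-thread exchange is needed.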
- template <typename input_t, typename weight_t>
- void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream) {
- if (params.width == 2) {
- causal_conv1d_update_launch<64, 2, input_t, weight_t>(params, stream);
- } else if (params.width == 3) {
- causal_conv1d_update_launch<64, 3, input_t, weight_t>(params, stream);
- } else if (params.width == 4) {
- causal_conv1d_update_launch<64, 4, input_t, weight_t>(params, stream);
- }
- }
- template void causal_conv1d_update_cuda<float, float>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_update_cuda<at::Half, float>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_update_cuda<at::BFloat16, float>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_update_cuda<float, at::Half>(ConvParamsBase& params,
- cudaStream_t stream);
- template void causal_conv1d_update_cuda<at::Half, at::Half>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_update_cuda<at::BFloat16, at::Half>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_update_cuda<float, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_update_cuda<at::Half, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);
- template void causal_conv1d_update_cuda<at::BFloat16, at::BFloat16>(
- ConvParamsBase& params, cudaStream_t stream);