/******************************************************************************
 * Copyright (c) 2024, Tri Dao.
 ******************************************************************************/

#pragma once

// NOTE(review): the original include targets were lost in extraction
// ("#include #include" with the <...> contents stripped). cuda_fp16.h /
// cuda_bf16.h are the usual headers for this kernel family (the kernels are
// instantiated for fp16/bf16 element types) -- confirm against upstream.
#include <cuda_fp16.h>
#include <cuda_bf16.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

// Parameter bundle passed from the host launcher to the causal conv1d
// kernels. Data pointers are type-erased (void*) because the kernels are
// instantiated for multiple element types; strides are presumably in
// elements, not bytes -- TODO confirm against the kernel code (not visible
// in this chunk).
struct ConvParamsBase {
    using index_t = uint32_t;

    int batch, dim, seqlen, width;
    bool silu_activation;   // fuse a SiLU epilogue into the conv when true

    // Strides for the input x, the conv weight, and the output.
    index_t x_batch_stride;
    index_t x_c_stride;
    index_t x_l_stride;
    index_t weight_c_stride;
    index_t weight_width_stride;
    index_t out_batch_stride;
    index_t out_c_stride;
    index_t out_l_stride;

    // Strides for the rolling conv state (used by the update/step path).
    index_t conv_state_batch_stride;
    index_t conv_state_c_stride;
    index_t conv_state_l_stride;

    // Common data pointers.
    void *__restrict__ x_ptr;
    void *__restrict__ weight_ptr;
    void *__restrict__ bias_ptr;
    void *__restrict__ out_ptr;
    void *__restrict__ conv_state_ptr;
    void *__restrict__ seq_idx_ptr;
    void *__restrict__ seq_pos_idx_ptr;

    // No __restrict__ since initial_states could be the same as final_states.
    void *initial_states_ptr;
    index_t initial_states_batch_stride;
    index_t initial_states_l_stride;
    index_t initial_states_c_stride;

    void *final_states_ptr;
    index_t final_states_batch_stride;
    index_t final_states_l_stride;
    index_t final_states_c_stride;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Maps a byte count to an unsigned integer type of exactly that size; used
// to express vectorized (e.g. 16-byte) global-memory loads/stores as a
// single typed access.
// NOTE(review): the primary template's parameter list was stripped during
// extraction; reconstructed as <int BYTES> to match the specializations.
template <int BYTES> struct BytesToType {};

template <> struct BytesToType<16> {
    using Type = uint4;
    static_assert(sizeof(Type) == 16);
};

template <> struct BytesToType<8> {
    using Type = uint64_t;
    static_assert(sizeof(Type) == 8);
};

template <> struct BytesToType<4> {
    using Type = uint32_t;
    static_assert(sizeof(Type) == 4);
};

template <> struct BytesToType<2> {
    using Type = uint16_t;
    static_assert(sizeof(Type) == 2);
};

template <> struct BytesToType<1> {
    using Type = uint8_t;
    static_assert(sizeof(Type) == 1);
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary addition functor for use with Allreduce.
// NOTE(review): parameter list reconstructed as <typename T> (stripped in
// extraction); T is the only undeclared name in the body.
template <typename T>
struct SumOp {
    __device__ inline T operator()(T const &x, T const &y) { return x + y; }
};

// Butterfly (xor-shuffle) all-reduce over THREADS lanes of a warp: each
// round combines values OFFSET lanes apart with a full-warp-mask
// __shfl_xor_sync, halving the width until the 2-lane base case below.
// After run() returns, every participating lane holds op folded over all
// THREADS inputs.
// NOTE(review): template parameter lists and the <OFFSET> recursion argument
// were stripped in extraction; reconstructed to match the Allreduce<2>
// specialization and the uses of THREADS / T / Operator in the bodies.
template <int THREADS>
struct Allreduce {
    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
    template <typename T, typename Operator>
    static __device__ inline T run(T x, Operator &op) {
        constexpr int OFFSET = THREADS / 2;
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
        return Allreduce<OFFSET>::run(x, op);
    }
};

// Base case: a single exchange between lane pairs 1 apart.
template <>
struct Allreduce<2> {
    template <typename T, typename Operator>
    static __device__ inline T run(T x, Operator &op) {
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
        return x;
    }
};