123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
- #pragma once
- #include <cuda.h>
- #include <vector>
- #include "cutlass/fast_math.h"
- using index_t = int64_t;
-
- void *__restrict__ q_ptr;
- void *__restrict__ k_ptr;
- void *__restrict__ v_ptr;
-
- index_t q_batch_stride;
- index_t k_batch_stride;
- index_t v_batch_stride;
- index_t q_row_stride;
- index_t k_row_stride;
- index_t v_row_stride;
- index_t q_head_stride;
- index_t k_head_stride;
- index_t v_head_stride;
-
- int h, h_k;
-
-
- int h_h_k_ratio;
- };
-
- void * __restrict__ o_ptr;
- void * __restrict__ oaccum_ptr;
-
- index_t o_batch_stride;
- index_t o_row_stride;
- index_t o_head_stride;
-
- void * __restrict__ p_ptr;
-
- void * __restrict__ softmax_lse_ptr;
- void * __restrict__ softmax_lseaccum_ptr;
-
- int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q, total_k;
-
- float scale_softmax;
- float scale_softmax_log2;
- uint32_t scale_softmax_log2_half2;
-
- int * __restrict__ cu_seqlens_q;
- int * __restrict__ cu_seqlens_k;
-
- int * __restrict__ seqused_k;
- int *__restrict__ blockmask;
-
- void * __restrict__ knew_ptr;
- void * __restrict__ vnew_ptr;
-
- index_t knew_batch_stride;
- index_t vnew_batch_stride;
- index_t knew_row_stride;
- index_t vnew_row_stride;
- index_t knew_head_stride;
- index_t vnew_head_stride;
-
- void * __restrict__ rotary_cos_ptr;
- void * __restrict__ rotary_sin_ptr;
-
- int * __restrict__ cache_batch_idx;
-
- int * __restrict__ block_table;
- index_t block_table_batch_stride;
- int page_block_size;
-
- float p_dropout;
-
-
- uint8_t p_dropout_in_uint8_t;
-
- float rp_dropout;
- float scale_softmax_rp_dropout;
-
- int window_size_left, window_size_right;
-
- uint64_t * rng_state;
- bool is_bf16;
- bool is_e4m3;
- bool is_causal;
-
-
- bool is_seqlens_k_cumulative;
- bool is_rotary_interleaved;
- int num_splits;
- void * __restrict__ alibi_slopes_ptr;
- index_t alibi_slopes_batch_stride;
- bool unpadded_lse;
- int * __restrict__ tile_count_semaphore;
- float * __restrict__ descale_q_ptr;
- float * __restrict__ descale_k_ptr;
- float * __restrict__ descale_v_ptr;
- };
-
- void *__restrict__ do_ptr;
- void *__restrict__ dq_ptr;
- void *__restrict__ dk_ptr;
- void *__restrict__ dv_ptr;
-
- void *__restrict__ dq_accum_ptr;
- void *__restrict__ dk_accum_ptr;
- void *__restrict__ dv_accum_ptr;
-
-
-
-
-
-
- index_t do_batch_stride;
- index_t do_row_stride;
- index_t do_head_stride;
- index_t dq_batch_stride;
- index_t dk_batch_stride;
- index_t dv_batch_stride;
- index_t dq_row_stride;
- index_t dk_row_stride;
- index_t dv_row_stride;
- index_t dq_head_stride;
- index_t dk_head_stride;
- index_t dv_head_stride;
-
- void *__restrict__ dsoftmax_sum;
- void *__restrict__ softmax_lse_log2_ptr;
- int *__restrict__ dq_semaphore;
- bool deterministic;
- index_t dq_accum_split_stride;
- };
- // Declaration only; the definition is not part of this section. Templated on a
- // type T and a compile-time integer Headdim. Takes the parameter struct by
- // non-const reference plus a CUDA stream (presumably the stream the work is
- // enqueued on -- confirm against the definition).
- template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
|