1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- #pragma once
- __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
- bool pred = true) {
- const int BYTES = 16;
- uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
- asm volatile(
- "{\n"
- " .reg .pred p;\n"
- " setp.ne.b32 p, %0, 0;\n"
- " @p cp.async.cg.shared.global [%1], [%2], %3;\n"
- "}\n" ::"r"((int)pred),
- "r"(smem), "l"(glob_ptr), "n"(BYTES));
- }
- __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
- const int BYTES = 16;
- uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
- asm volatile(
- "{\n"
- " cp.async.cg.shared.global [%0], [%1], %2;\n"
- "}\n" ::"r"(smem),
- "l"(glob_ptr), "n"(BYTES));
- }
- __device__ inline void cp_async_fence() {
- asm volatile("cp.async.commit_group;\n" ::);
- }
- template <int n>
- __device__ inline void cp_async_wait() {
- asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
- }
- __device__ inline void barrier_acquire(int* lock, int count) {
- if (threadIdx.x == 0) {
- int state = -1;
- do
-
-
- asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
- : "=r"(state)
- : "l"(lock));
- while (state != count);
- }
- __syncthreads();
- }
- __device__ inline void barrier_release(int* lock, bool reset = false) {
- __syncthreads();
- if (threadIdx.x == 0) {
- if (reset) {
- lock[0] = 0;
- return;
- }
- int val = 1;
-
-
- asm volatile("fence.acq_rel.gpu;\n");
- asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
- :
- : "l"(lock), "r"(val));
- }
- }
|