1
0

reduction.cuh 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. /*
  2. * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
  3. * Copyright (c) 2023 The PygmalionAI team.
  4. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the "License");
  7. * you may not use this file except in compliance with the License.
  8. * You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #pragma once
  namespace aphrodite {

  /*
   * Block-level sum: combines one `val` per thread into a single total using
   * two warp-level passes and a small shared-memory scratch array.
   *
   *   Pass 1: every warp reduces its own values with warpReduceSum<T>; lane 0
   *           of each warp stores that warp's result in shared[warp index].
   *   Pass 2: after a block barrier, the first ceil(blockDim.x / 32) threads
   *           each load one stored partial (all other threads contribute
   *           T(0.0f)) and warpReduceSum<T> is applied once more.
   *
   * Parameters:
   *   val - this thread's contribution to the sum.
   *
   * Returns:
   *   The value produced by the second warpReduceSum<T> call. Only threads of
   *   the first warp are fed the per-warp partials, so at most those threads
   *   can hold the block total; which lanes of that warp hold it depends on
   *   warpReduceSum, which is not defined in the visible part of this file
   *   (TODO: confirm against its definition).
   *
   * Preconditions:
   *   - Must be called by every thread of the block: it contains
   *     __syncthreads().
   *   - Lane and warp indices are derived from threadIdx.x only, so the code
   *     presumably expects a 1-D thread block - TODO confirm with callers.
   *   - The scratch array has 32 slots, i.e. at most 32 warps
   *     (blockDim.x <= 1024).
   *   - warpReduceSum<T> must already be declared where this template is
   *     defined; NOTE(review): no declaration is visible in this header.
   *
   * Caveat:
   *   The scratch array is a function-level static and there is no barrier
   *   after it is read in pass 2. A second call that starts while slower
   *   threads are still reading can overwrite slots in use, so callers that
   *   invoke this back-to-back should separate the calls with __syncthreads().
   */
  template<typename T>
  __inline__ __device__ T blockReduceSum(T val)
  {
      constexpr unsigned int kWarpSize     = 32;
      constexpr unsigned int kWarpSizeLog2 = 5;

      // One slot per warp of the block.
      static __shared__ T shared[kWarpSize];

      const int lane = threadIdx.x & (kWarpSize - 1);  // index within the warp
      const int wid  = threadIdx.x >> kWarpSizeLog2;   // index of the warp

      // Pass 1: per-warp partial sums.
      val = warpReduceSum<T>(val);
      if (lane == 0) {
          shared[wid] = val;
      }

      // Make every warp's partial visible before any thread reads it.
      __syncthreads();

      // Number of warps in the block, rounded up so that a trailing partial
      // warp is counted. Exact integer form of the previous float test
      // `threadIdx.x < blockDim.x / 32.f`.
      const unsigned int num_warps = (blockDim.x + kWarpSize - 1) >> kWarpSizeLog2;

      // Pass 2: the first num_warps threads pick up one partial each; every
      // other thread contributes zero.
      val = (threadIdx.x < num_warps) ? shared[lane] : (T)(0.0f);
      val = warpReduceSum<T>(val);
      return val;
  }

  }  // namespace aphrodite