// cuda_buffers.cuh

  // Adapted from turboderp exllama: https://github.com/turboderp/exllama
  // Include guard: opened here, closed by the #endif at the end of the header.
  #ifndef _cuda_buffers_cuh
  #define _cuda_buffers_cuh

  // CUDA headers: cudaStream_t / cudaEvent_t and the half type used below.
  #include <cuda_runtime.h>
  #include <cuda_fp16.h>

  // Standard headers.
  #include <cstdint>
  #include <cstdio>
  // Upper bound on the number of devices handled by this module.
  // NOTE(review): presumably the valid range for get_buffers()'s device_index
  // is [0, CUDA_MAX_DEVICES) -- confirm against the implementation file.
  const int CUDA_MAX_DEVICES = 16;
  // #ifndef _cuda_buffers_cu
  // extern __constant__ half2 q4_table[16][256];
  // #endif
  // Bundle of per-device scratch buffers together with three auxiliary CUDA
  // streams and one event per stream.
  //
  // NOTE(review): the class declares a destructor and stores raw stream/event
  // handles, yet the implicit copy operations remain available. If the
  // destructor (defined outside this header) releases those handles, a copy
  // would release them twice -- confirm, and consider deleting the copy
  // constructor and copy assignment operator.
  class CudaBuffers
  {
  public:
      int device;                     // device index these buffers are associated with

      half* temp_state;               // [max_hidden_rows * intermediate_size]
      int temp_state_size;            // size of temp_state; units are not visible here -- TODO confirm
      half* temp_dq;                  // size of largest quant tensor * 8

      // Auxiliary streams.
      cudaStream_t alt_stream_1;
      cudaStream_t alt_stream_2;
      cudaStream_t alt_stream_3;

      // One event per auxiliary stream; going by the names, each is presumably
      // recorded when work on the matching stream has finished -- confirm in
      // the implementation.
      cudaEvent_t alt_stream_1_done;
      cudaEvent_t alt_stream_2_done;
      cudaEvent_t alt_stream_3_done;

      // Parameters mirror the data members of the same name (minus the leading
      // underscore). The buffers are supplied by the caller rather than
      // allocated from the arguments shown here.
      CudaBuffers
      (
          int _device,
          int _temp_state_size,
          half* _temp_state,
          half* _temp_dq
      );

      ~CudaBuffers();
  };
  // Returns a pointer to the CudaBuffers associated with device_index.
  // NOTE(review): what is returned for a device that was never prepared, or for
  // an index outside [0, CUDA_MAX_DEVICES), is not visible from this header --
  // confirm before dereferencing the result unconditionally.
  CudaBuffers* get_buffers(const int device_index);
  // Sets up the buffers for one device. The parameter list matches the
  // CudaBuffers constructor one-for-one.
  //
  //   _device           device index
  //   _temp_state_size  size of the _temp_state buffer
  //   _temp_state       caller-provided scratch buffer
  //   _temp_dq          caller-provided scratch buffer
  //
  // NOTE(review): presumably this creates the CudaBuffers object that
  // get_buffers(_device) later returns -- confirm in the implementation file.
  void prepare_buffers_cuda
  (
      int _device,
      int _temp_state_size,
      half* _temp_state,
      half* _temp_dq
  );
  // Counterpart of prepare_buffers_cuda(); takes no arguments.
  // NOTE(review): presumably tears down every CudaBuffers object created so
  // far -- confirm in the implementation file.
  void cleanup_buffers_cuda();
  #endif