decoder_xqa_runner.h 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. /*
  2. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #pragma once
  17. #include "decoder_xqa_impl_precompiled.h"
  18. #include "xqa_params.h"
  19. #include "decoder_xqa_impl_common.h"
  20. class DecoderXQARunner {
  21. public:
  22. DecoderXQARunner(const XQADataType data_type, int num_heads, int num_kv_heads,
  23. int head_size, bool multi_block_mode);
  24. ~DecoderXQARunner();
  25. /**
  26. * \param[in] xqaParams the xqaParams to be tested against.
  27. */
  28. bool shouldUse(XQAParams const& xqaParams);
  29. size_t getWorkspaceSize(int max_num_tokens);
  30. void prepare(XQAParams const& xqa_params) { this->prepareForRun(xqa_params); }
  31. void dispatch(XQAParams const& xqa_params,
  32. KVCacheListParams const& kv_cache_buffer,
  33. cudaStream_t const& stream) {
  34. // sync_check_cuda_error(); //TODO
  35. this->run(xqa_params, kv_cache_buffer, stream);
  36. }
  37. class Resource;
  38. static Resource* getResourceGlobal();
  39. private:
  40. void prepareForRun(XQAParams const& xqa_params);
  41. void run(XQAParams const& xqa_params,
  42. KVCacheListParams const& kv_cache_buffer,
  43. cudaStream_t const& stream);
  44. static constexpr int kMaxBeamWidth = 4;
  45. XQADataType mDataType;
  46. int mNumHeads;
  47. int mNumKVHeads;
  48. int mHeadSize;
  49. bool mMultiBlockMode;
  50. int mMultiProcessorCount;
  51. // std::unique_ptr<DecoderXQAImpl> mJITImpl,
  52. std::unique_ptr<DecoderXQAImpl> mPrecompiledImpl;
  53. DecoderXQAImpl* getImplFromXQAParams(XQAParams const& params);
  54. friend DecoderXQAImplPrecompiled;
  55. };