xqa_params.h 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. /*
  2. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #pragma once
  17. #include "decoder_xqa_common.h"
  18. using XQADataType = Data_type;
  19. struct XQAParams {
  20. XQADataType data_type = DATA_TYPE_FP16;
  21. XQADataType kv_cache_data_type = DATA_TYPE_FP16;
  22. void* output = nullptr;
  23. void const* qHeads = nullptr;
  24. float const* kv_scale_quant_orig = nullptr;
  25. uint32_t* semaphores = nullptr;
  26. void* workspaces = nullptr;
  27. uint32_t batch_size = 0;
  28. int32_t beam_width = 0;
  29. int32_t generation_input_length;
  30. int32_t layer_idx = 0;
  31. int32_t num_q_heads = 0;
  32. int32_t num_kv_heads = 0;
  33. int32_t head_size = 0;
  34. int timestep = 0;
  35. // Paged KV cache parameters.
  36. bool paged_kv_cache = true; // always true
  37. int tokens_per_block;
  38. int max_blocks_per_sequence;
  39. bool multi_block_mode;
  40. bool multi_query_tokens = false;
  41. };