david
/
aphrodite-engine
mirror of https://github.com/PygmalionAI/aphrodite-engine


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344
							/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include "decoder_xqa_common.h"
// Local alias for Data_type. Data_type is not declared in this file;
// presumably it is provided by decoder_xqa_common.h -- confirm.
using XQADataType = Data_type;

// Plain parameter bundle for the XQA decoder code path.
//
// Every member carries a default member initializer so that a
// default-initialized instance (`XQAParams p;`) never contains
// indeterminate values. Integer members whose default is 0 are to be read
// as "not set yet"; callers are expected to fill in the real values.
//
// NOTE(review): the per-member descriptions below are inferred from the
// member names only -- confirm against the code that consumes this struct.
struct XQAParams {
  // Element types; both default to FP16.
  XQADataType data_type = DATA_TYPE_FP16;
  XQADataType kv_cache_data_type = DATA_TYPE_FP16;

  // Raw buffer pointers. Null until assigned by the caller; this struct
  // does not allocate or free them.
  void* output = nullptr;
  void const* qHeads = nullptr;
  float const* kv_scale_quant_orig = nullptr;
  uint32_t* semaphores = nullptr;
  void* workspaces = nullptr;

  uint32_t batch_size = 0;
  int32_t beam_width = 0;

  // Previously left uninitialized; now defaults to 0 like its neighbours.
  int32_t generation_input_length = 0;
  int32_t layer_idx = 0;
  int32_t num_q_heads = 0;
  int32_t num_kv_heads = 0;
  int32_t head_size = 0;
  int timestep = 0;

  // Paged KV cache parameters.
  bool paged_kv_cache = true;  // always true
  // The next three members were previously left uninitialized. They now
  // default to 0 / false, the same values value-initialization
  // (`XQAParams{}`) already produced.
  int tokens_per_block = 0;
  int max_blocks_per_sequence = 0;
  bool multi_block_mode = false;
  bool multi_query_tokens = false;
};