# utils.py
  1. from loguru import logger
  2. def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None:
  3. if num_gpu_blocks <= 0:
  4. raise ValueError("No available memory for the cache blocks. "
  5. "Try increasing `gpu_memory_utilization` when "
  6. "initializing the engine.")
  7. max_seq_len = block_size * num_gpu_blocks
  8. logger.info(f"Maximum sequence length allowed in the cache: "
  9. f"{max_seq_len}")
  10. if max_model_len > max_seq_len:
  11. raise ValueError(
  12. f"The model's max seq len ({max_model_len}) "
  13. "is larger than the maximum number of tokens that can be "
  14. f"stored in KV cache ({max_seq_len}). Try increasing "
  15. "`gpu_memory_utilization` or decreasing `max_model_len` when "
  16. "initializing the engine.")