from loguru import logger
def check_block_size_valid(num_gpu_blocks: int, block_size: int,
                           max_model_len: int) -> None:
    """Validate that the allocated KV-cache blocks can hold a full sequence.

    Args:
        num_gpu_blocks: Number of cache blocks that fit in GPU memory.
        block_size: Number of tokens stored per cache block.
        max_model_len: The model's configured maximum sequence length.

    Raises:
        ValueError: If no blocks could be allocated at all, or if the total
            cache capacity (``block_size * num_gpu_blocks`` tokens) is smaller
            than ``max_model_len``.
    """
    # No blocks at all means the requested memory budget left nothing for
    # the cache — the engine cannot serve any request.
    if num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")
    # Total tokens the cache can hold; a single sequence may at most span
    # every block.
    max_seq_len = block_size * num_gpu_blocks
    # Deferred formatting: loguru interpolates `{}` only if the record is
    # actually emitted, avoiding eager f-string work.
    logger.info("Maximum sequence length allowed in the cache: {}",
                max_seq_len)
    if max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({max_model_len}) "
            "is larger than the maximum number of tokens that can be "
            f"stored in KV cache ({max_seq_len}). Try increasing "
            "`gpu_memory_utilization` or decreasing `max_model_len` when "
            "initializing the engine.")