# test_swap.py

import torch

from aphrodite.common.sequence import ExecuteModelRequest
from aphrodite.common.utils import (get_distributed_init_method, get_ip,
                                    get_open_port)
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.worker.worker import Worker


def test_swap() -> None:
    """Swap KV-cache blocks between the GPU and CPU caches and verify
    that the copied blocks match exactly."""
    # Configure the engine. The "dummy" load format fills the weights
    # with random values instead of downloading the real checkpoint.
    engine_args = EngineArgs(model="facebook/opt-125m",
                             dtype="half",
                             load_format="dummy")
    engine_config = engine_args.create_engine_config()
    # Use fixed block counts so the test is deterministic.
    engine_config.cache_config.num_gpu_blocks = 1000
    engine_config.cache_config.num_cpu_blocks = 1000

    # Create the worker.
    distributed_init_method = get_distributed_init_method(
        get_ip(), get_open_port())
    worker = Worker(
        model_config=engine_config.model_config,
        parallel_config=engine_config.parallel_config,
        scheduler_config=engine_config.scheduler_config,
        device_config=engine_config.device_config,
        cache_config=engine_config.cache_config,
        load_config=engine_config.load_config,
        local_rank=0,
        rank=0,
        distributed_init_method=distributed_init_method,
        is_driver_worker=True,
    )

    # Initialize the worker.
    worker.init_device()
    worker.load_model()
    worker.initialize_cache(
        num_gpu_blocks=engine_config.cache_config.num_gpu_blocks,
        num_cpu_blocks=engine_config.cache_config.num_cpu_blocks)
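
    # The caches allocated above hold one (key, value) tensor pair per
    # model layer; the swap requests below copy blocks between them.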

    # Randomly initialize both caches so that copied blocks are
    # distinguishable from untouched ones.
    gpu_cache = worker.cache_engine[0].gpu_cache
    cpu_cache = worker.cache_engine[0].cpu_cache
    num_layers = len(gpu_cache)
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]
        gpu_key_cache.random_()
        gpu_value_cache.random_()
        cpu_key_cache, cpu_value_cache = cpu_cache[i]
        cpu_key_cache.random_()
        cpu_value_cache.random_()
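
    # For a pair (src, dst) in blocks_to_swap_out, the expected per-layer
    # effect is roughly (a sketch of the semantics, not the actual call):
    #   cpu_key_cache[dst].copy_(gpu_key_cache[src])
    #   cpu_value_cache[dst].copy_(gpu_value_cache[src])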

    # rtol=0/atol=0 makes this an exact element-wise equality check.
    def allclose(a, b):
        return torch.allclose(a.cuda(), b.cuda(), rtol=0.0, atol=0.0)

    # Test swap out: each pair is (GPU block number, CPU block number),
    # and the GPU block's contents are copied into the CPU block.
    blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)]
    execute_model_req = ExecuteModelRequest(
        seq_group_metadata_list=[],
        blocks_to_swap_in=[],
        blocks_to_swap_out=blocks_to_swap_out,
        blocks_to_copy=[],
    )
    worker.execute_model(execute_model_req=execute_model_req)

    # Every swapped-out block must now match its CPU destination.
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]
        cpu_key_cache, cpu_value_cache = cpu_cache[i]
        for src, dst in blocks_to_swap_out:
            assert allclose(gpu_key_cache[src], cpu_key_cache[dst])
            assert allclose(gpu_value_cache[src], cpu_value_cache[dst])

    # Test swap in: here each pair is (CPU block number, GPU block number).
    execute_model_req.blocks_to_swap_out = []
    execute_model_req.blocks_to_swap_in = [
        (19, 45),
        (67, 23),
        (12, 78),
        (40, 99),
        (1, 71),
    ]
    worker.execute_model(execute_model_req=execute_model_req)

    # Every swapped-in block must have arrived on the GPU intact.
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]
        cpu_key_cache, cpu_value_cache = cpu_cache[i]
        for src, dst in execute_model_req.blocks_to_swap_in:
            assert allclose(gpu_key_cache[dst], cpu_key_cache[src])
            assert allclose(gpu_value_cache[dst], cpu_value_cache[src])
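

if __name__ == "__main__":
    # Convenience entry point for running the file directly; normally
    # pytest collects and runs test_swap().
    test_swap()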