# test_tensorizer.py
  1. import gc
  2. import json
  3. import os
  4. import pathlib
  5. import subprocess
  6. from unittest.mock import MagicMock, patch
  7. import openai
  8. import pytest
  9. import torch
  10. from tensorizer import EncryptionParams
  11. from aphrodite import SamplingParams
  12. from aphrodite.engine.args_tools import EngineArgs
  13. # yapf: disable
  14. from aphrodite.modeling.model_loader.tensorizer import (
  15. TensorizerConfig, TensorSerializer, is_aphrodite_tensorized,
  16. load_with_tensorizer, open_stream, serialize_aphrodite_model,
  17. tensorize_aphrodite_model)
  18. from ..conftest import AphroditeRunner
  19. from ..utils import RemoteOpenAIServer
  20. from .conftest import retry_until_skip
  21. # yapf conflicts with isort for this docstring
  22. prompts = [
  23. "Hello, my name is",
  24. "The president of the United States is",
  25. "The capital of France is",
  26. "The future of AI is",
  27. ]
  28. # Create a sampling params object.
  29. sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
  30. model_ref = "facebook/opt-125m"
  31. tensorize_model_for_testing_script = os.path.join(
  32. os.path.dirname(__file__), "tensorize_aphrodite_model_for_testing.py")
  33. def is_curl_installed():
  34. try:
  35. subprocess.check_call(['curl', '--version'])
  36. return True
  37. except (subprocess.CalledProcessError, FileNotFoundError):
  38. return False
  39. def get_torch_model(aphrodite_runner: AphroditeRunner):
  40. return aphrodite_runner \
  41. .model \
  42. .llm_engine \
  43. .model_executor \
  44. .driver_worker \
  45. .model_runner \
  46. .model
  47. def write_keyfile(keyfile_path: str):
  48. encryption_params = EncryptionParams.random()
  49. pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True)
  50. with open(keyfile_path, 'wb') as f:
  51. f.write(encryption_params.key)
  52. @patch('aphrodite.modeling.model_loader.tensorizer.TensorizerAgent')
  53. def test_load_with_tensorizer(mock_agent, tensorizer_config):
  54. mock_linear_method = MagicMock()
  55. mock_agent_instance = mock_agent.return_value
  56. mock_agent_instance.deserialize.return_value = MagicMock()
  57. result = load_with_tensorizer(tensorizer_config,
  58. quant_method=mock_linear_method)
  59. mock_agent.assert_called_once_with(tensorizer_config,
  60. quant_method=mock_linear_method)
  61. mock_agent_instance.deserialize.assert_called_once()
  62. assert result == mock_agent_instance.deserialize.return_value
  63. @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
  64. def test_can_deserialize_s3(aphrodite_runner):
  65. model_ref = "EleutherAI/pythia-1.4b"
  66. tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
  67. with aphrodite_runner(model_ref,
  68. load_format="tensorizer",
  69. model_loader_extra_config=TensorizerConfig(
  70. tensorizer_uri=tensorized_path,
  71. num_readers=1,
  72. s3_endpoint="object.ord1.coreweave.com",
  73. )) as loaded_hf_model:
  74. deserialized_outputs = loaded_hf_model.generate(prompts,
  75. sampling_params)
  76. # noqa: E501
  77. assert deserialized_outputs
  78. @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
  79. def test_deserialized_encrypted_aphrodite_model_has_same_outputs(
  80. aphrodite_runner, tmp_path):
  81. with aphrodite_runner(model_ref) as aphrodite_model:
  82. model_path = tmp_path / (model_ref + ".tensors")
  83. key_path = tmp_path / (model_ref + ".key")
  84. write_keyfile(key_path)
  85. outputs = aphrodite_model.generate(prompts, sampling_params)
  86. config_for_serializing = TensorizerConfig(
  87. tensorizer_uri=model_path,
  88. encryption_keyfile=key_path
  89. )
  90. serialize_aphrodite_model(get_torch_model(aphrodite_model),
  91. config_for_serializing)
  92. config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
  93. encryption_keyfile=key_path)
  94. with aphrodite_runner(
  95. model_ref,
  96. load_format="tensorizer",
  97. model_loader_extra_config=config_for_deserializing) as loaded_aphrodite_model: # noqa: E501
  98. deserialized_outputs = loaded_aphrodite_model.generate(prompts,
  99. sampling_params)
  100. # noqa: E501
  101. assert outputs == deserialized_outputs
  102. def test_deserialized_hf_model_has_same_outputs(hf_runner, aphrodite_runner,
  103. tmp_path):
  104. with hf_runner(model_ref) as hf_model:
  105. model_path = tmp_path / (model_ref + ".tensors")
  106. max_tokens = 50
  107. outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
  108. with open_stream(model_path, "wb+") as stream:
  109. serializer = TensorSerializer(stream)
  110. serializer.write_module(hf_model.model)
  111. with aphrodite_runner(model_ref,
  112. load_format="tensorizer",
  113. model_loader_extra_config=TensorizerConfig(
  114. tensorizer_uri=model_path,
  115. num_readers=1,
  116. )) as loaded_hf_model:
  117. deserialized_outputs = loaded_hf_model.generate_greedy(
  118. prompts, max_tokens=max_tokens)
  119. assert outputs == deserialized_outputs
  120. def test_aphrodite_model_can_load_with_lora(aphrodite_runner, tmp_path):
  121. from huggingface_hub import snapshot_download
  122. from examples.offline_inference.lora_aphrodite_engine import (
  123. create_test_prompts, process_requests)
  124. model_ref = "meta-llama/Llama-2-7b-hf"
  125. lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
  126. test_prompts = create_test_prompts(lora_path)
  127. # Serialize model before deserializing and binding LoRA adapters
  128. with aphrodite_runner(model_ref, ) as aphrodite_model:
  129. model_path = tmp_path / (model_ref + ".tensors")
  130. serialize_aphrodite_model(get_torch_model(aphrodite_model),
  131. TensorizerConfig(tensorizer_uri=model_path))
  132. with aphrodite_runner(
  133. model_ref,
  134. load_format="tensorizer",
  135. model_loader_extra_config=TensorizerConfig(
  136. tensorizer_uri=model_path,
  137. num_readers=1,
  138. ),
  139. enable_lora=True,
  140. max_loras=1,
  141. max_lora_rank=8,
  142. max_cpu_loras=2,
  143. max_num_seqs=50,
  144. max_model_len=1000,
  145. ) as loaded_aphrodite_model:
  146. process_requests(loaded_aphrodite_model.model.llm_engine, test_prompts)
  147. assert loaded_aphrodite_model
  148. def test_load_without_tensorizer_load_format(aphrodite_runner):
  149. model = None
  150. with pytest.raises(ValueError):
  151. model = aphrodite_runner(
  152. model_ref,
  153. model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
  154. del model
  155. gc.collect()
  156. torch.cuda.empty_cache()
  157. @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
  158. def test_openai_apiserver_with_tensorizer(aphrodite_runner, tmp_path):
  159. ## Serialize model
  160. with aphrodite_runner(model_ref, ) as aphrodite_model:
  161. model_path = tmp_path / (model_ref + ".tensors")
  162. serialize_aphrodite_model(get_torch_model(aphrodite_model),
  163. TensorizerConfig(tensorizer_uri=model_path))
  164. model_loader_extra_config = {
  165. "tensorizer_uri": str(model_path),
  166. }
  167. ## Start OpenAI API server
  168. openai_args = [
  169. "--dtype", "float16", "--load-format",
  170. "tensorizer", "--model-loader-extra-config",
  171. json.dumps(model_loader_extra_config),
  172. ]
  173. with RemoteOpenAIServer(model_ref, openai_args) as server:
  174. print("Server ready.")
  175. client = server.get_client()
  176. completion = client.completions.create(model=model_ref,
  177. prompt="Hello, my name is",
  178. max_tokens=5,
  179. temperature=0.0)
  180. assert completion.id is not None
  181. assert len(completion.choices) == 1
  182. assert len(completion.choices[0].text) >= 5
  183. assert completion.choices[0].finish_reason == "length"
  184. assert completion.usage == openai.types.CompletionUsage(
  185. completion_tokens=5, prompt_tokens=6, total_tokens=11)
  186. def test_raise_value_error_on_invalid_load_format(aphrodite_runner):
  187. model = None
  188. with pytest.raises(ValueError):
  189. model = aphrodite_runner(
  190. model_ref,
  191. load_format="safetensors",
  192. model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
  193. del model
  194. gc.collect()
  195. torch.cuda.empty_cache()
  196. @pytest.mark.skipif(torch.cuda.device_count() < 2,
  197. reason="Requires 2 GPUs")
  198. def test_tensorizer_with_tp_path_without_template(aphrodite_runner):
  199. with pytest.raises(ValueError):
  200. model_ref = "EleutherAI/pythia-1.4b"
  201. tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
  202. aphrodite_runner(
  203. model_ref,
  204. load_format="tensorizer",
  205. model_loader_extra_config=TensorizerConfig(
  206. tensorizer_uri=tensorized_path,
  207. num_readers=1,
  208. s3_endpoint="object.ord1.coreweave.com",
  209. ),
  210. tensor_parallel_size=2,
  211. disable_custom_all_reduce=True,
  212. )
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Requires 2 GPUs")
def test_deserialized_encrypted_aphrodite_model_with_tp_has_same_outputs(
        aphrodite_runner, tmp_path):
    """Serialize a model sharded over TP=2 with encryption, reload it with
    tensor parallelism, and check outputs match the un-sharded baseline.

    Order matters here: the baseline engine is shut down before the
    TP serialization subprocess runs, so both can fit on the GPUs.
    """
    model_ref = "EleutherAI/pythia-1.4b"
    # record outputs from un-sharded un-tensorized model
    with aphrodite_runner(
        model_ref,
        disable_custom_all_reduce=True,
        enforce_eager=True,
    ) as base_model:
        outputs = base_model.generate(prompts, sampling_params)
        # Explicitly release the executor's GPU resources before the
        # tensorize_aphrodite_model subprocess claims both devices.
        base_model.model.llm_engine.model_executor.shutdown()

    # load model with two shards and serialize with encryption
    # "%02d" is the per-rank shard template; tensorizer expands it to
    # one file per tensor-parallel rank.
    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
    key_path = tmp_path / (model_ref + ".key")

    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        encryption_keyfile=key_path,
    )

    tensorize_aphrodite_model(
        engine_args=EngineArgs(
            model=model_ref,
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
            enforce_eager=True,
        ),
        tensorizer_config=tensorizer_config,
    )
    # One shard file per rank must exist, otherwise serialization failed.
    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"

    with aphrodite_runner(
        model_ref,
        tensor_parallel_size=2,
        load_format="tensorizer",
        disable_custom_all_reduce=True,
        enforce_eager=True,
        model_loader_extra_config=tensorizer_config
    ) as loaded_aphrodite_model:
        deserialized_outputs = loaded_aphrodite_model.generate(
            prompts, sampling_params)
        assert outputs == deserialized_outputs
  255. @retry_until_skip(3)
  256. def test_aphrodite_tensorized_model_has_same_outputs(
  257. aphrodite_runner, tmp_path):
  258. gc.collect()
  259. torch.cuda.empty_cache()
  260. model_ref = "facebook/opt-125m"
  261. model_path = tmp_path / (model_ref + ".tensors")
  262. config = TensorizerConfig(tensorizer_uri=str(model_path))
  263. with aphrodite_runner(model_ref) as aphrodite_model:
  264. outputs = aphrodite_model.generate(prompts, sampling_params)
  265. serialize_aphrodite_model(get_torch_model(aphrodite_model), config)
  266. assert is_aphrodite_tensorized(config)
  267. with aphrodite_runner(model_ref,
  268. load_format="tensorizer",
  269. model_loader_extra_config=config
  270. ) as loaded_aphrodite_model:
  271. deserialized_outputs = loaded_aphrodite_model.generate(prompts,
  272. sampling_params)
  273. # noqa: E501
  274. assert outputs == deserialized_outputs