tensorize_aphrodite_model.py

import argparse
import dataclasses
import json
import uuid

import aphrodite.common.envs as envs
from aphrodite import LLM
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.modeling.model_loader.tensorizer import (
    TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)

# yapf conflicts with isort for this docstring
# yapf: disable
  12. """
  13. tensorize_aphrodite_model.py is a script that can be used to serialize and
  14. deserialize Aphrodite models. These models can be loaded using tensorizer
  15. to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
  16. or locally. Tensor encryption and decryption is also supported, although
  17. libsodium must be installed to use it. Install aphrodite with tensorizer
  18. support using `pip install aphrodite[tensorizer]`. To learn more about
  19. tensorizer, visit https://github.com/coreweave/tensorizer
  20. To serialize a model, install Aphrodite from source, then run something
  21. like this from the root level of this repository:
  22. python -m examples.tensorize_aphrodite_model \
  23. --model facebook/opt-125m \
  24. serialize \
  25. --serialized-directory s3://my-bucket \
  26. --suffix v1
  27. Which downloads the model from HuggingFace, loads it into Aphrodite, serializes
  28. it, and saves it to your S3 bucket. A local directory can also be used. This
  29. assumes your S3 credentials are specified as environment variables
  30. in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
  31. `S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
  32. `--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
  33. as CLI args to this script.
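
For example, the environment variables could be set like this before running
the script (the values below are placeholders for your own credentials):

    export S3_ACCESS_KEY_ID=<your access key id>
    export S3_SECRET_ACCESS_KEY=<your secret access key>
    export S3_ENDPOINT_URL=<your S3-compatible endpoint URL>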

You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.
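
For example, to serialize with encryption (the key path `opt-key.bin` is
illustrative; the randomly-generated key is written to that path during
serialization):

    python -m examples.tensorize_aphrodite_model \
        --model facebook/opt-125m \
        serialize \
        --serialized-directory s3://my-bucket \
        --suffix v1 \
        --keyfile opt-key.bin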

To deserialize a model, you can run something like this from the root
level of this repository:

    python -m examples.tensorize_aphrodite_model \
        --model EleutherAI/gpt-j-6B \
        --dtype float16 \
        deserialize \
        --path-to-tensors s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors

This downloads the model tensors from your S3 bucket and deserializes them.

You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
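
For example, to load the encrypted opt-125m tensors from the serialization
example above (the key path `opt-key.bin` is again illustrative):

    python -m examples.tensorize_aphrodite_model \
        --model facebook/opt-125m \
        deserialize \
        --path-to-tensors s3://my-bucket/aphrodite/facebook/opt-125m/v1/model.tensors \
        --keyfile opt-key.bin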

To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named
model-rank-%03d.tensors.
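
For example, serializing facebook/opt-125m with `--tensor-parallel-size 2` and
the bucket and suffix used above would produce two shard files:

    s3://my-bucket/aphrodite/facebook/opt-125m/v1/model-rank-000.tensors
    s3://my-bucket/aphrodite/facebook/opt-125m/v1/model-rank-001.tensors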

For more information on the available arguments for serializing, run
`python -m examples.tensorize_aphrodite_model serialize --help`.

Or for deserializing:
`python -m examples.tensorize_aphrodite_model deserialize --help`.

Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                  tensorizer_uri=path_to_tensors,
                  num_readers=3,
              ))
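
The returned `llm` can then be used as usual. A minimal sketch (the prompt is
illustrative and default sampling parameters are assumed):

    outputs = llm.generate("What is the capital of France?")
    print(outputs[0].outputs[0].text)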

A serialized model can be used during model loading for the Aphrodite OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.
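
For example, something along these lines should work (the server entrypoint and
flag set may differ across Aphrodite versions; the S3 path matches the
deserialize example above):

    python -m aphrodite.endpoints.openai.api_server \
        --model EleutherAI/gpt-j-6B \
        --load-format tensorizer \
        --model-loader-extra-config \
        '{"tensorizer_uri": "s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors", "num_readers": 3}'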

To see all of the available arguments that can be given to `TensorizerConfig`
to configure loading with tensorizer, run
`python -m examples.tensorize_aphrodite_model deserialize --help`
and look under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""


def parse_args():
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize Aphrodite models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption are "
        "also supported, although libsodium must be installed to "
        "use them.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))
    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/aphrodite/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")
    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize.")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


def deserialize():
    # Relies on `args` and `tensorizer_config` being defined at module level
    # in the __main__ block below.
    llm = LLM(model=args.model,
              load_format="tensorizer",
              tensor_parallel_size=args.tensor_parallel_size,
              model_loader_extra_config=tensorizer_config)
    return llm


if __name__ == '__main__':
    args = parse_args()

    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                        or envs.S3_ACCESS_KEY_ID)
    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                            or envs.S3_SECRET_ACCESS_KEY)
    s3_endpoint = (getattr(args, 's3_endpoint', None)
                   or envs.S3_ENDPOINT_URL)

    credentials = {
        "s3_access_key_id": s3_access_key_id,
        "s3_secret_access_key": s3_secret_access_key,
        "s3_endpoint": s3_endpoint
    }

    model_ref = args.model
    model_name = model_ref.split("/")[1]
    keyfile = args.keyfile if args.keyfile else None

    if args.model_loader_extra_config:
        config = json.loads(args.model_loader_extra_config)
        # Build the TensorizerConfig from the user-supplied JSON and override
        # its URI with `--path-to-tensors`; `deserialize()` reads this
        # module-level `tensorizer_config`.
        tensorizer_config = TensorizerConfig(**config)
        tensorizer_config.tensorizer_uri = args.path_to_tensors
        tensorizer_args = tensorizer_config._construct_tensorizer_args()
    else:
        tensorizer_args = None

    if args.command == "serialize":
        eng_args_dict = {f.name: getattr(args, f.name) for f in
                         dataclasses.fields(EngineArgs)}

        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict))

        input_dir = args.serialized_directory.rstrip('/')
        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        base_path = f"{input_dir}/aphrodite/{model_ref}/{suffix}"
        if engine_args.tensor_parallel_size > 1:
            model_path = f"{base_path}/model-rank-%03d.tensors"
        else:
            model_path = f"{base_path}/model.tensors"

        tensorizer_config = TensorizerConfig(
            tensorizer_uri=model_path,
            encryption_keyfile=keyfile,
            **credentials)
        tensorize_aphrodite_model(engine_args, tensorizer_config)
    elif args.command == "deserialize":
        if not tensorizer_args:
            tensorizer_config = TensorizerConfig(
                tensorizer_uri=args.path_to_tensors,
                encryption_keyfile=keyfile,
                **credentials)
        deserialize()
    else:
        raise ValueError("Either serialize or deserialize must be specified.")