tensorize_aphrodite_model.py

import argparse
import dataclasses
import json
import uuid

from aphrodite import LLM, envs
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.modeling.model_loader.tensorizer import (
    TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)

# yapf conflicts with isort for this docstring
# yapf: disable
  11. """
  12. tensorize_aphrodite_model.py is a script that can be used to serialize and
  13. deserialize Aphrodite models. These models can be loaded using tensorizer
  14. to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
  15. or locally. Tensor encryption and decryption is also supported, although
  16. libsodium must be installed to use it. Install aphrodite with tensorizer
  17. support using `pip install aphrodite[tensorizer]`. To learn more about
  18. tensorizer, visit https://github.com/coreweave/tensorizer
To serialize a model, install Aphrodite from source, then run something
like this from the root level of this repository:

python -m examples.tensorize_aphrodite_model \
    --model facebook/opt-125m \
    serialize \
    --serialized-directory s3://my-bucket \
    --suffix v1
Which downloads the model from HuggingFace, loads it into Aphrodite, serializes
it, and saves it to your S3 bucket. A local directory can also be used. This
assumes your S3 credentials are specified as environment variables
in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
as CLI args to this script.
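
For example, the environment-variable form might look like this (the values
shown are placeholders, not real credentials):

export S3_ACCESS_KEY_ID=<your-access-key-id>
export S3_SECRET_ACCESS_KEY=<your-secret-access-key>
export S3_ENDPOINT_URL=<your-s3-endpoint-url>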

You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.
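
For example, extending the serialize command above (the keyfile path here is
only an illustration):

python -m examples.tensorize_aphrodite_model \
    --model facebook/opt-125m \
    serialize \
    --serialized-directory s3://my-bucket \
    --suffix v1 \
    --keyfile ~/model.key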

To deserialize a model, you can run something like this from the root
level of this repository:

python -m examples.tensorize_aphrodite_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
    --path-to-tensors s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors

Which downloads the model tensors from your S3 bucket and deserializes them.

You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.

To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
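
For example, serializing facebook/opt-125m with `--tensor-parallel-size 2`
(an engine argument this script accepts) and the same bucket and suffix as
above would produce files along the lines of:

s3://my-bucket/aphrodite/facebook/opt-125m/v1/model-rank-000.tensors
s3://my-bucket/aphrodite/facebook/opt-125m/v1/model-rank-001.tensors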

For more information on the available arguments for serializing, run
`python -m examples.tensorize_aphrodite_model serialize --help`.

Or for deserializing:

`python -m examples.tensorize_aphrodite_model deserialize --help`.

Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                  tensorizer_uri=path_to_tensors,
                  num_readers=3,
              ))

A serialized model can be used during model loading for the Aphrodite OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.
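
For example, a sketch of launching the server with a tensorized model (the
exact server entry point and flag spellings may differ between Aphrodite
versions):

python -m aphrodite.endpoints.openai.api_server \
    --model facebook/opt-125m \
    --load-format tensorizer \
    --model-loader-extra-config \
    '{"tensorizer_uri": "s3://my-bucket/aphrodite/facebook/opt-125m/v1/model.tensors", "num_readers": 3}'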

In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:

`python -m examples.tensorize_aphrodite_model deserialize --help`

under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
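
If the tensors were serialized with encryption, the same `LLM` pattern shown
above should also work at load time by pointing `encryption_keyfile` at the
saved key (a sketch; the key path is illustrative):

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                  tensorizer_uri=path_to_tensors,
                  encryption_keyfile="/path/to/model.key",
              ))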
  73. """


def parse_args():
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize Aphrodite models. These models "
        "can be loaded with tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption are "
        "also supported, although libsodium must be installed to "
        "use them.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))

    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/aphrodite/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize.")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


def deserialize():
    # Uses the module-level ``args`` and ``tensorizer_config`` set up in the
    # ``__main__`` block below.
    llm = LLM(model=args.model,
              load_format="tensorizer",
              tensor_parallel_size=args.tensor_parallel_size,
              model_loader_extra_config=tensorizer_config)
    return llm


if __name__ == '__main__':
    args = parse_args()

    # S3 credentials can come from CLI args (when present) or the environment.
    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                        or envs.S3_ACCESS_KEY_ID)
    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                            or envs.S3_SECRET_ACCESS_KEY)
    s3_endpoint = (getattr(args, 's3_endpoint', None)
                   or envs.S3_ENDPOINT_URL)

    credentials = {
        "s3_access_key_id": s3_access_key_id,
        "s3_secret_access_key": s3_secret_access_key,
        "s3_endpoint": s3_endpoint
    }

    model_ref = args.model
    model_name = model_ref.split("/")[1]
    keyfile = args.keyfile if args.keyfile else None

    if args.model_loader_extra_config:
        config = json.loads(args.model_loader_extra_config)
        tensorizer_config = TensorizerConfig(**config)
        tensorizer_args = tensorizer_config._construct_tensorizer_args()
        # Point both the config and the derived args at --path-to-tensors so
        # that deserialize() loads the tensors given on the command line.
        tensorizer_config.tensorizer_uri = args.path_to_tensors
        tensorizer_args.tensorizer_uri = args.path_to_tensors
    else:
        tensorizer_args = None

    if args.command == "serialize":
        eng_args_dict = {f.name: getattr(args, f.name) for f in
                         dataclasses.fields(EngineArgs)}

        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict))

        input_dir = args.serialized_directory.rstrip('/')
        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        base_path = f"{input_dir}/aphrodite/{model_ref}/{suffix}"
        if engine_args.tensor_parallel_size > 1:
            # One file per tensor-parallel rank; %03d is rendered with the
            # shard's rank at serialization time.
            model_path = f"{base_path}/model-rank-%03d.tensors"
        else:
            model_path = f"{base_path}/model.tensors"

        tensorizer_config = TensorizerConfig(
            tensorizer_uri=model_path,
            encryption_keyfile=keyfile,
            **credentials)
        tensorize_aphrodite_model(engine_args, tensorizer_config)

    elif args.command == "deserialize":
        if not tensorizer_args:
            tensorizer_config = TensorizerConfig(
                tensorizer_uri=args.path_to_tensors,
                encryption_keyfile=keyfile,
                **credentials)
        deserialize()
    else:
        raise ValueError("Either serialize or deserialize must be specified.")