import argparse
import dataclasses
import json
import os
import uuid
from functools import partial

from tensorizer import stream_io

from aphrodite import LLM
from aphrodite.distributed import (init_distributed_environment,
                                   initialize_model_parallel)
from aphrodite.engine.aphrodite_engine import AphroditeEngine
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.modeling.model_loader.tensorizer import (
    TensorizerArgs, TensorizerConfig, serialize_aphrodite_model)

# yapf conflicts with isort for this docstring
# yapf: disable
  17. """
  18. tensorize_aphrodite_model.py is a script that can be used to serialize and
  19. deserialize Aphrodite models. These models can be loaded using tensorizer
  20. to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
  21. or locally. Tensor encryption and decryption is also supported, although
  22. libsodium must be installed to use it. Install aphrodite with tensorizer
  23. support using `pip install aphrodite[tensorizer]`. To learn more about
  24. tensorizer, visit https://github.com/coreweave/tensorizer
  25. To serialize a model, install Aphrodite from source, then run something
  26. like this from the root level of this repository:
  27. python -m examples.tensorize_aphrodite_model \
  28. --model facebook/opt-125m \
  29. serialize \
  30. --serialized-directory s3://my-bucket \
  31. --suffix v1
  32. Which downloads the model from HuggingFace, loads it into Aphrodite, serializes
  33. it, and saves it to your S3 bucket. A local directory can also be used. This
  34. assumes your S3 credentials are specified as environment variables
  35. in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
  36. `S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
  37. `--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
  38. as CLI args to this script.
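
For example, the environment-variable form might look like this, with
placeholder values:

export S3_ACCESS_KEY_ID=<your-access-key-id>
export S3_SECRET_ACCESS_KEY=<your-secret-access-key>
export S3_ENDPOINT_URL=<your-endpoint-url>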

You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.
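
For example, reusing the serialize command above (the key path is
illustrative; the randomly-generated key is written to it):

python -m examples.tensorize_aphrodite_model \
   --model facebook/opt-125m \
   serialize \
   --serialized-directory s3://my-bucket \
   --suffix v1 \
   --keyfile ./model.key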

To deserialize a model, you can run something like this from the root
level of this repository:

python -m examples.tensorize_aphrodite_model \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
   --path-to-tensors s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors

Which downloads the model tensors from your S3 bucket and deserializes them.

You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
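
For example, with the same illustrative key path used at serialization time:

python -m examples.tensorize_aphrodite_model \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
   --path-to-tensors s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors \
   --keyfile ./model.key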

For more information on the available arguments for serializing, run
`python -m examples.tensorize_aphrodite_model serialize --help`.

Or for deserializing:

`python -m examples.tensorize_aphrodite_model deserialize --help`.

Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:

   llm = LLM(model="facebook/opt-125m",
             load_format="tensorizer",
             model_loader_extra_config=TensorizerConfig(
                 tensorizer_uri=path_to_tensors,
                 num_readers=3,
             ))

A serialized model can be used during model loading for the Aphrodite OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.
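
For example, the server can be pointed at the serialized tensors from the
earlier example with something like the following (the server module path is
an assumption and may differ between Aphrodite versions):

python -m aphrodite.endpoints.openai.api_server \
   --model facebook/opt-125m \
   --load-format tensorizer \
   --model-loader-extra-config \
   '{"tensorizer_uri": "s3://my-bucket/aphrodite/facebook/opt-125m/v1/model.tensors", "num_readers": 3}'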

In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:

`python -m examples.tensorize_aphrodite_model deserialize --help`

under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""


def parse_args():
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize Aphrodite models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/aphrodite/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))

    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/aphrodite/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize.")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


def deserialize():
    # Uses the module-level `args` and `tensorizer_config` defined below.
    llm = LLM(model=args.model,
              load_format="tensorizer",
              model_loader_extra_config=tensorizer_config)
    return llm


args = parse_args()

# S3 credentials can come from CLI args (when present) or the environment.
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                    or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                        or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
               or os.environ.get("S3_ENDPOINT_URL", None))

credentials = {
    "s3_access_key_id": s3_access_key_id,
    "s3_secret_access_key": s3_secret_access_key,
    "s3_endpoint": s3_endpoint
}

_read_stream, _write_stream = (partial(
    stream_io.open_stream,
    mode=mode,
    s3_access_key_id=s3_access_key_id,
    s3_secret_access_key=s3_secret_access_key,
    s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))

model_ref = args.model

model_name = model_ref.split("/")[1]

# A (single-process) distributed environment must be initialized before the
# engine can be constructed.
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"

init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()

keyfile = args.keyfile if args.keyfile else None

if args.model_loader_extra_config:
    config = json.loads(args.model_loader_extra_config)
    # Build the TensorizerConfig here as well so that `deserialize()` has a
    # config to use when it is supplied as a JSON string.
    tensorizer_config = TensorizerConfig(**config)
    tensorizer_args = tensorizer_config._construct_tensorizer_args()
    tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
    tensorizer_args = None

if args.command == "serialize":
    eng_args_dict = {f.name: getattr(args, f.name) for f in
                     dataclasses.fields(EngineArgs)}

    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
    engine = AphroditeEngine.from_engine_args(engine_args)

    input_dir = args.serialized_directory.rstrip('/')
    suffix = args.suffix if args.suffix else uuid.uuid4().hex
    base_path = f"{input_dir}/aphrodite/{model_ref}/{suffix}"
    model_path = f"{base_path}/model.tensors"
    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        **credentials)
    serialize_aphrodite_model(engine, tensorizer_config, keyfile)
elif args.command == "deserialize":
    if not tensorizer_args:
        tensorizer_config = TensorizerConfig(
            tensorizer_uri=args.path_to_tensors,
            encryption_keyfile=keyfile,
            **credentials)
    deserialize()
else:
    raise ValueError("Either serialize or deserialize must be specified.")