# tokenizer.py

import os
import tempfile
from typing import List, Optional, Tuple, Union

import gguf
from loguru import logger
from transformers import (AutoTokenizer, LlamaTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)
from transformers.convert_slow_tokenizer import import_protobuf

from aphrodite.common.utils import LRUCache, make_async
from aphrodite.lora.request import LoRARequest
from aphrodite.transformers_utils.tokenizers import BaichuanTokenizer


def convert_gguf_to_tokenizer(checkpoint):
    """Reconstruct a ``LlamaTokenizer`` from the tokenizer metadata stored in
    a GGUF checkpoint, via a temporary sentencepiece model file."""
    result = gguf.GGUFReader(checkpoint)
    # write vocab
    sentencepiece_model_pb2 = import_protobuf()
    vocab = sentencepiece_model_pb2.ModelProto()
    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
    vocab.trainer_spec.model_type = 2  # BPE
    vocab.trainer_spec.vocab_size = vocab_size
    vocab.trainer_spec.byte_fallback = True
    vocab.normalizer_spec.remove_extra_whitespaces = False
    tokens = result.fields['tokenizer.ggml.tokens']
    scores = result.fields['tokenizer.ggml.scores']
    types = result.fields['tokenizer.ggml.token_type']
    for i in range(vocab_size):
        new_token = vocab.SentencePiece()
        new_token.piece = str(bytes(tokens.parts[tokens.data[i]]),
                              encoding='utf-8')
        new_token.score = scores.parts[scores.data[i]]
        # llama.cpp token types map directly onto sentencepiece token types
        new_token.type = int(types.parts[types.data[i]])
        vocab.pieces.append(new_token)
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as temp_file:
        temp_file.write(vocab.SerializeToString())
        temp_file_filename = temp_file.name

    tokenizer_args = {"vocab_file": temp_file_filename}
    if 'tokenizer.ggml.bos_token_id' in result.fields:
        tokenizer_args["bos_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.bos_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.eos_token_id' in result.fields:
        tokenizer_args["eos_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.eos_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.padding_token_id' in result.fields:
        tokenizer_args["pad_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.padding_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.unknown_token_id' in result.fields:
        tokenizer_args["unk_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.unknown_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.add_bos_token' in result.fields:
        tokenizer_args["add_bos_token"] = bool(
            result.fields['tokenizer.ggml.add_bos_token'].parts[-1])
    if 'tokenizer.ggml.add_eos_token' in result.fields:
        tokenizer_args["add_eos_token"] = bool(
            result.fields['tokenizer.ggml.add_eos_token'].parts[-1])
    if 'tokenizer.chat_template' in result.fields:
        # decode the template bytes so the stored string does not carry a
        # ``b'...'`` repr wrapper
        tokenizer_args["chat_template"] = str(
            bytes(result.fields['tokenizer.chat_template'].parts[-1]),
            encoding='utf-8')
    tokenizer = LlamaTokenizer(**tokenizer_args)
    os.unlink(temp_file_filename)
    return tokenizer
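

# Illustrative usage sketch (comment only): the GGUF path is a placeholder,
# not a file shipped with this module.
#
#   tokenizer = convert_gguf_to_tokenizer("/path/to/model.Q4_K_M.gguf")
#   token_ids = tokenizer.encode("Hello, world!")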


def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    tokenizer_revision: Optional[str] = None,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via Huggingface."""
    if tokenizer_name.endswith("gguf"):
        return convert_gguf_to_tokenizer(tokenizer_name)

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name,
            *args,
            trust_remote_code=trust_remote_code,
            tokenizer_revision=tokenizer_revision,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the tokenizer class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        if (not trust_remote_code and
            ("does not exist or is not currently imported." in str(e)
             or "requires you to execute the tokenizer file" in str(e))):
            err_msg = (
                "Failed to load the tokenizer. If the tokenizer is a custom "
                "tokenizer not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    except AttributeError as e:
        if "BaichuanTokenizer" in str(e):
            # This is for the error "'BaichuanTokenizer' object has no
            # attribute 'sp_model'".
            tokenizer = BaichuanTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                tokenizer_revision=tokenizer_revision,
                **kwargs)
        else:
            raise e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead.")
    return tokenizer
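

# Illustrative usage sketch (comment only): the model identifier is a
# placeholder for any Hugging Face repo id or local path with tokenizer files.
#
#   tokenizer = get_tokenizer("/path/to/model-or-hf-repo",
#                             tokenizer_mode="auto",
#                             trust_remote_code=False)
#   token_ids = tokenizer.encode("Hello, world!")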


def get_lora_tokenizer(lora_request: LoRARequest, *args,
                       **kwargs) -> Optional[PreTrainedTokenizer]:
    """Load the tokenizer shipped with a LoRA adapter, if any.

    Returns ``None`` when no request is given or when no tokenizer can be
    loaded from the adapter path, in which case callers fall back to the
    base model tokenizer.
    """
    if lora_request is None:
        return None
    try:
        tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
                                  **kwargs)
    except OSError as e:
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
            f"No tokenizer found in {lora_request.lora_local_path}, "
            "using base model tokenizer instead. "
            f"(Exception: {str(e)})")
        tokenizer = None
    return tokenizer


get_lora_tokenizer_async = make_async(get_lora_tokenizer)


class TokenizerGroup:
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], **tokenizer_config):
        self.tokenizer_id = tokenizer_id
        self.tokenizer_config = tokenizer_config
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
        if enable_lora:
            self.lora_tokenizers = LRUCache(capacity=max_num_seqs)
        else:
            self.lora_tokenizers = None

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = self.get_lora_tokenizer(lora_request)
        return tokenizer.encode(prompt)

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        return tokenizer.encode(prompt)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            tokenizer = (get_lora_tokenizer(
                lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers.get(lora_request.lora_int_id)

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            tokenizer = (await get_lora_tokenizer_async(
                lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers.get(lora_request.lora_int_id)
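

# Illustrative usage sketch (comment only): arguments are placeholders; with
# LoRA disabled, every request uses the base tokenizer.
#
#   group = TokenizerGroup(tokenizer_id="/path/to/base-model",
#                          enable_lora=False,
#                          max_num_seqs=256,
#                          max_input_length=None)
#   token_ids = group.encode("Hello, world!")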


def _convert_tokens_to_string_with_added_encoders(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    output_tokens: List[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    # Adapted from
    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE: The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts = []
    current_sub_text = []
    all_special_tokens = set(tokenizer.all_special_tokens)
    for token in output_tokens:
        if skip_special_tokens and token in all_special_tokens:
            continue
        if token in tokenizer.get_added_vocab():
            if current_sub_text:
                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
                sub_texts.append(sub_text)
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
        sub_texts.append(sub_text)
    if spaces_between_special_tokens:
        return " ".join(sub_texts)
    else:
        return "".join(sub_texts)


# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int = 0,
    read_offset: int = 0,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> Tuple[List[str], str, int, int]:
    """Decode only the text contributed by the newest token in
    ``all_input_ids``.

    Returns the tokens decoded in this step, the newly finalized text (empty
    while a byte-fallback sequence is still incomplete), and the updated
    ``prefix_offset`` and ``read_offset`` to pass to the next call.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    if prev_tokens is None:
        new_tokens = tokenizer.convert_ids_to_tokens(
            all_input_ids, skip_special_tokens=skip_special_tokens)
        output_tokens = new_tokens
        # 5 is an arbitrary value that should work for all
        # tokenizers (bigger = more conservative).
        # Subtract 1 extra to account for the generated token.
        prefix_offset = max(len(output_tokens) - 6, 0)
        # If the first new token is a special token, we can't skip 1 extra
        # token
        if skip_special_tokens and new_token_id in tokenizer.all_special_ids:
            read_offset = max(len(output_tokens), 0)
        else:
            read_offset = max(len(output_tokens) - 1, 0)
    else:
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        output_tokens = prev_tokens + new_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

    if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
        # A replacement character ("�") at the end means it's a potential
        # unfinished byte sequence from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        new_text = new_text[len(prefix_text):]
        return new_tokens, new_text, read_offset, len(output_tokens)
    else:
        return new_tokens, "", prefix_offset, read_offset
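

if __name__ == "__main__":
    # Minimal, illustrative smoke test of the incremental decode loop, not a
    # definitive reference. The model id below is an assumption; substitute
    # any local path or Hugging Face repo that provides tokenizer files.
    tok = get_tokenizer("facebook/opt-125m")
    ids = tok.encode("Incremental detokenization example.")
    prev_tokens, prefix_offset, read_offset = None, 0, 0
    pieces = []
    for i in range(1, len(ids) + 1):
        # Feed the ids seen so far; only the newly finalized text comes back.
        new_tokens, text, prefix_offset, read_offset = (
            detokenize_incrementally(tok, ids[:i], prev_tokens, prefix_offset,
                                     read_offset))
        prev_tokens = (new_tokens if prev_tokens is None else
                       prev_tokens + new_tokens)
        pieces.append(text)
    print("".join(pieces))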