123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301 |
- import os
- import tempfile
- from typing import List, Optional, Tuple, Union
- import gguf
- from transformers import (AutoTokenizer, PreTrainedTokenizer,
- PreTrainedTokenizerFast, LlamaTokenizer)
- from transformers.convert_slow_tokenizer import import_protobuf
- from aphrodite.common.logger import init_logger
- from aphrodite.lora.request import LoRARequest
- from aphrodite.common.utils import make_async, LRUCache
- from aphrodite.transformers_utils.tokenizers import BaichuanTokenizer
- logger = init_logger(__name__)
def convert_gguf_to_tokenizer(checkpoint):
    """Build a ``LlamaTokenizer`` from the tokenizer metadata of a GGUF file.

    The vocabulary stored under the ``tokenizer.ggml.*`` fields is copied
    into a SentencePiece ``ModelProto``, serialized to a temporary file and
    handed to ``LlamaTokenizer`` as its ``vocab_file``. The temporary file is
    always removed again, including when building the tokenizer fails.

    Args:
        checkpoint: Path to the GGUF checkpoint, passed to
            ``gguf.GGUFReader``.

    Returns:
        The ``LlamaTokenizer`` created from the embedded vocabulary.
    """
    result = gguf.GGUFReader(checkpoint)
    fields = result.fields

    # write vocab
    sentencepiece_model_pb2 = import_protobuf()
    vocab = sentencepiece_model_pb2.ModelProto()
    tokens = fields['tokenizer.ggml.tokens']
    scores = fields['tokenizer.ggml.scores']
    types = fields['tokenizer.ggml.token_type']
    vocab_size = len(types.data)
    vocab.trainer_spec.model_type = 2  # BPE
    vocab.trainer_spec.vocab_size = vocab_size
    vocab.trainer_spec.byte_fallback = True
    vocab.normalizer_spec.remove_extra_whitespaces = False
    for i in range(vocab_size):
        new_token = vocab.SentencePiece()
        new_token.piece = str(bytes(tokens.parts[tokens.data[i]]),
                              encoding='utf-8')
        new_token.score = scores.parts[scores.data[i]]
        # llama.cpp tokentype is the same with sentencepiece token type
        new_token.type = int(types.parts[types.data[i]])
        vocab.pieces.append(new_token)

    tokenizer_args = {}
    # Special tokens are stored as token ids; translate each one that is
    # present into the corresponding piece string.
    special_token_fields = (
        ("bos_token", 'tokenizer.ggml.bos_token_id'),
        ("eos_token", 'tokenizer.ggml.eos_token_id'),
        ("pad_token", 'tokenizer.ggml.padding_token_id'),
        ("unk_token", 'tokenizer.ggml.unknown_token_id'),
    )
    for arg_name, field_name in special_token_fields:
        if field_name in fields:
            token_id = int(fields[field_name].parts[-1])
            tokenizer_args[arg_name] = vocab.pieces[token_id].piece
    # Boolean flags are forwarded under the same name they have in GGUF.
    for flag_name in ("add_bos_token", "add_eos_token"):
        field_name = 'tokenizer.ggml.' + flag_name
        if field_name in fields:
            tokenizer_args[flag_name] = bool(fields[field_name].parts[-1])

    # delete=False: the file is closed before LlamaTokenizer reads it back by
    # path, so it must outlive the `with` block and be removed explicitly.
    temp_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    temp_file_filename = temp_file.name
    try:
        with temp_file:
            temp_file.write(vocab.SerializeToString())
        tokenizer = LlamaTokenizer(vocab_file=temp_file_filename,
                                   **tokenizer_args)
    finally:
        # Never leave the serialized vocab behind, even on failure.
        os.unlink(temp_file_filename)
    return tokenizer
def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    tokenizer_revision: Optional[str] = None,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via Huggingface.

    Args:
        tokenizer_name: Model name or path. A name ending in ``"gguf"`` is
            converted with ``convert_gguf_to_tokenizer`` and returned
            immediately; none of the other arguments apply to that path.
        *args: Extra positional arguments for ``from_pretrained``.
        tokenizer_mode: ``"slow"`` forces ``use_fast=False``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        tokenizer_revision: Revision of the tokenizer to load. Forwarded as
            ``revision`` unless the caller already supplied ``revision`` in
            ``kwargs``.
        **kwargs: Extra keyword arguments for ``from_pretrained``.

    Returns:
        The loaded tokenizer.

    Raises:
        ValueError: If ``use_fast=True`` is combined with
            ``tokenizer_mode="slow"``, or if loading fails for a reason
            unrelated to remote code.
        RuntimeError: If loading fails because the tokenizer needs remote
            code and ``trust_remote_code`` is false.
    """
    if tokenizer_name.endswith("gguf"):
        return convert_gguf_to_tokenizer(tokenizer_name)

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    # `from_pretrained` picks the hub revision through its `revision`
    # argument; a `tokenizer_revision` keyword would not select anything.
    # setdefault keeps an explicitly passed `revision` authoritative and
    # avoids passing the keyword twice.
    if tokenizer_revision is not None:
        kwargs.setdefault("revision", tokenizer_revision)

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the tokenizer class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        message = str(e)
        if (not trust_remote_code and
            ("does not exist or is not currently imported." in message
             or "requires you to execute the tokenizer file" in message)):
            err_msg = (
                "Failed to load the tokenizer. If the tokenizer is a custom "
                "tokenizer not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        raise
    except AttributeError as e:
        if "BaichuanTokenizer" not in str(e):
            raise
        # This is for the error "'BaichuanTokenizer' object has no
        # attribute 'sp_model'".
        tokenizer = BaichuanTokenizer.from_pretrained(
            tokenizer_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead.")
    return tokenizer
def get_lora_tokenizer(lora_request: LoRARequest, *args,
                       **kwargs) -> Optional[PreTrainedTokenizer]:
    """Load the tokenizer that ships with a LoRA adapter, if there is one.

    Args:
        lora_request: The adapter request; ``None`` means "no adapter".
        *args: Forwarded to ``get_tokenizer``.
        **kwargs: Forwarded to ``get_tokenizer``.

    Returns:
        The adapter's tokenizer, or ``None`` when no request was given or
        loading from the adapter folder raised ``OSError`` (callers then
        fall back to the base model tokenizer).
    """
    if lora_request is None:
        return None

    try:
        return get_tokenizer(lora_request.lora_local_path, *args, **kwargs)
    except OSError as e:
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
            f"No tokenizer found in {lora_request.lora_local_path}, "
            "using base model tokenizer instead. "
            f"(Exception: {str(e)})")
        return None
# Awaitable counterpart of get_lora_tokenizer, produced by the project's
# make_async helper; TokenizerGroup.get_lora_tokenizer_async awaits it.
# NOTE(review): presumably this runs the blocking load off the event loop —
# confirm against aphrodite.common.utils.make_async.
get_lora_tokenizer_async = make_async(get_lora_tokenizer)
class TokenizerGroup:
    """A group of tokenizers that can be used for LoRA adapters.

    Holds the base model tokenizer and, when LoRA is enabled, a cache of
    adapter tokenizers keyed by ``lora_int_id``.
    """

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], **tokenizer_config):
        """Load the base tokenizer and set up the adapter cache.

        Args:
            tokenizer_id: Name or path handed to ``get_tokenizer``.
            enable_lora: Whether per-adapter tokenizers are supported.
            max_num_seqs: Capacity of the adapter tokenizer cache.
            max_input_length: Stored on the instance; not consulted by the
                methods of this class.
            **tokenizer_config: Keyword arguments forwarded to every
                ``get_tokenizer`` / ``get_lora_tokenizer`` call.
        """
        self.tokenizer_id = tokenizer_id
        self.tokenizer_config = tokenizer_config
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
        # The cache only exists when adapters can actually be requested.
        self.lora_tokenizers = (LRUCache(capacity=max_num_seqs)
                                if enable_lora else None)

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode ``prompt`` with the tokenizer matching ``lora_request``."""
        return self.get_lora_tokenizer(lora_request).encode(prompt)

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Async variant of ``encode``; the tokenizer lookup is awaited."""
        chosen = await self.get_lora_tokenizer_async(lora_request)
        return chosen.encode(prompt)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        """Return the tokenizer for ``lora_request``.

        Falls back to the base tokenizer when there is no request, LoRA is
        disabled, or the adapter has no tokenizer of its own. Newly loaded
        tokenizers are stored in the cache under ``lora_int_id``.
        """
        if not (lora_request and self.enable_lora):
            return self.tokenizer

        adapter_id = lora_request.lora_int_id
        if adapter_id in self.lora_tokenizers:
            return self.lora_tokenizers.get(adapter_id)

        # Resolves to the module-level loader, not this method.
        loaded = get_lora_tokenizer(lora_request, **self.tokenizer_config)
        resolved = loaded or self.tokenizer
        self.lora_tokenizers.put(adapter_id, resolved)
        return resolved

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        """Async variant of ``get_lora_tokenizer`` with the same fallbacks."""
        if not (lora_request and self.enable_lora):
            return self.tokenizer

        adapter_id = lora_request.lora_int_id
        if adapter_id in self.lora_tokenizers:
            return self.lora_tokenizers.get(adapter_id)

        loaded = await get_lora_tokenizer_async(lora_request,
                                                **self.tokenizer_config)
        resolved = loaded or self.tokenizer
        self.lora_tokenizers.put(adapter_id, resolved)
        return resolved
def _convert_tokens_to_string_with_added_encoders(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    output_tokens: List[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    """Join tokens into text, keeping added-vocabulary tokens verbatim.

    Runs of ordinary tokens are converted with
    ``tokenizer.convert_tokens_to_string``; every token found in the
    tokenizer's added vocabulary is emitted unchanged as its own segment.

    Args:
        tokenizer: Tokenizer providing ``all_special_tokens``,
            ``get_added_vocab`` and ``convert_tokens_to_string``.
        output_tokens: Token strings to convert.
        skip_special_tokens: Drop tokens listed in ``all_special_tokens``.
        spaces_between_special_tokens: Join the segments with a single
            space instead of concatenating them directly.

    Returns:
        The reconstructed text.
    """
    # Adapted from
    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE: The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts = []
    current_sub_text = []
    all_special_tokens = set(tokenizer.all_special_tokens)
    # The added vocabulary does not depend on the token being examined, so
    # fetch it once instead of once per token.
    added_vocab = tokenizer.get_added_vocab()
    for token in output_tokens:
        if skip_special_tokens and token in all_special_tokens:
            continue
        if token in added_vocab:
            # Flush the pending run of ordinary tokens before the added one.
            if current_sub_text:
                sub_texts.append(
                    tokenizer.convert_tokens_to_string(current_sub_text))
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_texts.append(tokenizer.convert_tokens_to_string(current_sub_text))

    separator = " " if spaces_between_special_tokens else ""
    return separator.join(sub_texts)
- # Based on
- # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
- # under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int = 0,
    read_offset: int = 0,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> Tuple[List[str], str, int, int]:
    """Decode the text contributed by the newest token of a sequence.

    Args:
        tokenizer: Tokenizer used for id-to-token and token-to-text steps.
        all_input_ids: All token ids so far; the last one is the new token.
        prev_tokens: Token strings from earlier calls, or ``None`` on the
            first call (then every id is converted and both offsets are
            computed here, ignoring the values passed in).
        prefix_offset: Start of the context window inside the token list.
        read_offset: End of the already-emitted part of that window.
        skip_special_tokens: Drop special tokens while converting.
        spaces_between_special_tokens: Forwarded to the slow-tokenizer
            conversion helper.

    Returns:
        ``(new_tokens, new_text, prefix_offset, read_offset)``. When no
        complete text is available yet, ``new_text`` is ``""`` and the
        offsets are returned unchanged; otherwise the offsets advance to
        the old ``read_offset`` and the current token count.
    """
    latest_id = all_input_ids[-1]
    if prev_tokens is not None:
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [latest_id], skip_special_tokens=skip_special_tokens)
        output_tokens = prev_tokens + new_tokens
    else:
        # This is the first iteration for this sequence
        new_tokens = tokenizer.convert_ids_to_tokens(
            all_input_ids, skip_special_tokens=skip_special_tokens)
        output_tokens = new_tokens
        token_count = len(output_tokens)
        # 5 is an arbitrary value that should work for all
        # tokenizers (bigger = more conservative).
        # Subtract 1 extra to account for the generated token.
        prefix_offset = max(token_count - 6, 0)
        # If the first new token is a special token, we can't skip 1 extra
        # token
        newest_was_skipped = (skip_special_tokens
                              and latest_id in tokenizer.all_special_ids)
        unread = 0 if newest_was_skipped else 1
        read_offset = max(token_count - unread, 0)

    # Pick the token-to-text converter once; slow tokenizers with added
    # vocabulary need the helper that keeps added tokens verbatim.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        to_text = tokenizer.convert_tokens_to_string
    else:

        def to_text(token_slice):
            return _convert_tokens_to_string_with_added_encoders(
                tokenizer,
                token_slice,
                skip_special_tokens=skip_special_tokens,
                spaces_between_special_tokens=spaces_between_special_tokens,
            )

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    prefix_text = to_text(output_tokens[prefix_offset:read_offset])
    new_text = to_text(output_tokens[prefix_offset:])

    # utf-8 char at the end means it's a potential unfinished byte sequence
    # from byte fallback tokenization.
    # If it's in the middle, it's probably a real invalid id generated
    # by the model
    nothing_to_emit = (len(new_text) <= len(prefix_text)
                       or new_text.endswith("�"))
    if nothing_to_emit:
        return new_tokens, "", prefix_offset, read_offset

    emitted = new_text[len(prefix_text):]
    return new_tokens, emitted, read_offset, len(output_tokens)
|