text.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. from synthesizer.utils.symbols import symbols
  2. from synthesizer.utils import cleaners
  3. import re
  4. # Mappings from symbol to numeric ID and vice versa:
  5. _symbol_to_id = {s: i for i, s in enumerate(symbols)}
  6. _id_to_symbol = {i: s for i, s in enumerate(symbols)}
  7. # Regular expression matching text enclosed in curly braces:
  8. _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
  9. def text_to_sequence(text, cleaner_names):
  10. """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
  11. The text can optionally have ARPAbet sequences enclosed in curly braces embedded
  12. in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
  13. Args:
  14. text: string to convert to a sequence
  15. cleaner_names: names of the cleaner functions to run the text through
  16. Returns:
  17. List of integers corresponding to the symbols in the text
  18. """
  19. sequence = []
  20. # Check for curly braces and treat their contents as ARPAbet:
  21. while len(text):
  22. m = _curly_re.match(text)
  23. if not m:
  24. sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
  25. break
  26. sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
  27. sequence += _arpabet_to_sequence(m.group(2))
  28. text = m.group(3)
  29. # Append EOS token
  30. sequence.append(_symbol_to_id["~"])
  31. return sequence
  32. def sequence_to_text(sequence):
  33. """Converts a sequence of IDs back to a string"""
  34. result = ""
  35. for symbol_id in sequence:
  36. if symbol_id in _id_to_symbol:
  37. s = _id_to_symbol[symbol_id]
  38. # Enclose ARPAbet back in curly braces:
  39. if len(s) > 1 and s[0] == "@":
  40. s = "{%s}" % s[1:]
  41. result += s
  42. return result.replace("}{", " ")
  43. def _clean_text(text, cleaner_names):
  44. for name in cleaner_names:
  45. cleaner = getattr(cleaners, name)
  46. if not cleaner:
  47. raise Exception("Unknown cleaner: %s" % name)
  48. text = cleaner(text)
  49. return text
  50. def _symbols_to_sequence(symbols):
  51. return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
  52. def _arpabet_to_sequence(text):
  53. return _symbols_to_sequence(["@" + s for s in text.split()])
  54. def _should_keep_symbol(s):
  55. return s in _symbol_to_id and s not in ("_", "~")