# japanese.py
  1. # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
  2. import re
  3. import sys
  4. import pyopenjtalk
  5. from text import symbols
  6. # Regular expression matching Japanese without punctuation marks:
  7. _japanese_characters = re.compile(
  8. r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
  9. )
  10. # Regular expression matching non-Japanese characters or punctuation marks:
  11. _japanese_marks = re.compile(
  12. r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
  13. )
  14. # List of (symbol, Japanese) pairs for marks:
  15. _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
  16. # List of (consonant, sokuon) pairs:
  17. _real_sokuon = [
  18. (re.compile("%s" % x[0]), x[1])
  19. for x in [
  20. (r"Q([↑↓]*[kg])", r"k#\1"),
  21. (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
  22. (r"Q([↑↓]*[sʃ])", r"s\1"),
  23. (r"Q([↑↓]*[pb])", r"p#\1"),
  24. ]
  25. ]
  26. # List of (consonant, hatsuon) pairs:
  27. _real_hatsuon = [
  28. (re.compile("%s" % x[0]), x[1])
  29. for x in [
  30. (r"N([↑↓]*[pbm])", r"m\1"),
  31. (r"N([↑↓]*[ʧʥj])", r"n^\1"),
  32. (r"N([↑↓]*[tdn])", r"n\1"),
  33. (r"N([↑↓]*[kg])", r"ŋ\1"),
  34. ]
  35. ]
  36. def post_replace_ph(ph):
  37. rep_map = {
  38. ":": ",",
  39. ";": ",",
  40. ",": ",",
  41. "。": ".",
  42. "!": "!",
  43. "?": "?",
  44. "\n": ".",
  45. "·": ",",
  46. "、": ",",
  47. "...": "…",
  48. }
  49. if ph in rep_map.keys():
  50. ph = rep_map[ph]
  51. if ph in symbols:
  52. return ph
  53. if ph not in symbols:
  54. ph = "UNK"
  55. return ph
  56. def symbols_to_japanese(text):
  57. for regex, replacement in _symbols_to_japanese:
  58. text = re.sub(regex, replacement, text)
  59. return text
  60. def preprocess_jap(text, with_prosody=False):
  61. """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
  62. text = symbols_to_japanese(text)
  63. sentences = re.split(_japanese_marks, text)
  64. marks = re.findall(_japanese_marks, text)
  65. text = []
  66. for i, sentence in enumerate(sentences):
  67. if re.match(_japanese_characters, sentence):
  68. if with_prosody:
  69. text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
  70. else:
  71. p = pyopenjtalk.g2p(sentence)
  72. text += p.split(" ")
  73. if i < len(marks):
  74. if marks[i] == " ":# 防止意外的UNK
  75. continue
  76. text += [marks[i].replace(" ", "")]
  77. return text
  78. def text_normalize(text):
  79. # todo: jap text normalize
  80. return text
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
    """Extract phoneme + prosoody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
    """
    # Full-context HTS labels, one per phoneme (plus leading/trailing "sil").
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)

    phones = []
    for n in range(N):
        lab_curr = labels[n]

        # current phoneme (the "-p3+" field of the full-context label)
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)

        # deal unvoiced vowels as normal vowels
        # (OpenJTalk marks devoiced vowels with uppercase A/E/I/O/U)
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            # silence only ever appears as the first or last label
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")  # sentence start marker
            elif n == N - 1:
                # check question form or not (e3 == 1 means interrogative)
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")  # declarative sentence end
                elif e3 == 1:
                    phones.append("?")  # question sentence end
            continue
        elif p3 == "pau":
            phones.append("_")  # pause between phrases
            continue
        else:
            phones.append(p3)

        # accent type and position info (forward or backward)
        # NOTE: _numeric_feature_by_regex returns -50 when a field is absent,
        # which never satisfies the comparisons below.
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # a2 of the next label; safe because the final label is "sil",
        # which `continue`s above before reaching this point
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    return phones
  143. # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
  144. def _numeric_feature_by_regex(regex, s):
  145. match = re.search(regex, s)
  146. if match is None:
  147. return -50
  148. return int(match.group(1))
  149. def g2p(norm_text, with_prosody=False):
  150. phones = preprocess_jap(norm_text, with_prosody)
  151. phones = [post_replace_ph(i) for i in phones]
  152. # todo: implement tones and word2ph
  153. return phones
  154. if __name__ == "__main__":
  155. phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
  156. print(phones)