chinese.py

import os
import pdb
import re

import cn2an
from pypinyin import lazy_pinyin, Style

from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
from text.zh_normalization.text_normlization import TextNormalizer

normalizer = lambda x: cn2an.transform(x, "an2cn")

current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}
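
# Note (illustrative, not from the original file): opencpop-strict.txt is assumed to
# be a tab-separated table that maps each pinyin syllable to an initial and a final
# separated by a single space (e.g. something like "zhong\tzh ong"); _g2p later
# splits that value into exactly two phone symbols.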

import jieba_fast.posseg as psg

rep_map = {
    "：": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
}

tone_modifier = ToneSandhi()


def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
    )

    return replaced_text
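
# Illustrative sketch (not part of the original file), assuming "," and "!" are in
# text.symbols.punctuation: replace_punctuation("你好，世界！123") should come out
# roughly as "你好,世界!". Full-width marks are rewritten to their ASCII targets and
# the digits are dropped, since numbers are expected to have been verbalized by
# text_normalize beforehand.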


def g2p(text):
    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
    phones, word2ph = _g2p(sentences)
    return phones, word2ph
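
# Rough usage sketch (shapes are the point; exact tones depend on tone sandhi):
# for already-normalized input such as "你好." this returns something like
#     phones  = ["n", "i2", "h", "ao3", "."]
#     word2ph = [2, 2, 1]
# where word2ph[i] counts how many phones the i-th character or punctuation mark
# expands to, so sum(word2ph) == len(phones).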


def _get_initials_finals(word):
    initials = []
    finals = []
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    for c, v in zip(orig_initials, orig_finals):
        initials.append(c)
        finals.append(v)
    return initials, finals
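
# For reference (illustrative, not from the original file): with pypinyin,
# Style.INITIALS yields the consonant onset of each syllable and Style.FINALS_TONE3
# yields the final with a trailing tone digit, so "中国" gives roughly
# ["zh", "g"] and ["ong1", "uo2"]; a zero-initial syllable such as "安" comes back
# with an empty string as its initial.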


def _g2p(segments):
    phones_list = []
    word2ph = []
    for seg in segments:
        pinyins = []
        # Replace all English words in the sentence
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)
            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                assert c in punctuation
                phone = [c]
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # syllable with an initial consonant
                    v_rep_map = {
                        "uei": "ui",
                        "iou": "iu",
                        "uen": "un",
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # zero-initial syllable
                    pinyin_rep_map = {
                        "ing": "ying",
                        "i": "yi",
                        "in": "yin",
                        "u": "wu",
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            "v": "yu",
                            "e": "e",
                            "i": "y",
                            "u": "w",
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
                new_v = new_v + tone
                phone = [new_c, new_v]
                word2ph.append(len(phone))

            phones_list += phone
    return phones_list, word2ph
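
# Illustrative (not from the original file): the remapping above converts pypinyin's
# raw spellings to the ones used in opencpop-strict.txt. Assuming typical pypinyin
# output, "牛" (c="n", final "iou2") is rewritten to "niu", the zero-initial "一"
# ("i1") becomes "yi", and "我" ("uo3") becomes "wo" before the symbol-map lookup
# splits each syllable into one initial phone plus one tone-marked final phone.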


def text_normalize(text):
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation(sentence)
    return dest_text
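
# Illustrative sketch (not from the original file): TextNormalizer is expected to
# verbalize digits and other non-Chinese tokens before replace_punctuation runs,
# e.g. text_normalize("他拿了100分") should yield roughly "他拿了一百分", so g2p
# only ever sees CJK characters plus the allowed punctuation marks.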


if __name__ == "__main__":
    text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
    text = "呣呣呣~就是…大人的鼹鼠党吧?"
    text = "你好"
    text = text_normalize(text)
    print(g2p(text))


# # Example usage
# text = "这是一个示例文本:,你好!这是一个测试..."
# print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试