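# Chinese grapheme-to-phoneme (G2P) front end: normalizes raw text with
# PaddleSpeech's zh_normalization, segments it with jieba, applies tone sandhi,
# converts each character to a pinyin initial/final pair with pypinyin, and
# maps every syllable to the phoneme symbols listed in opencpop-strict.txt.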
import os
import re

import cn2an
import jieba_fast.posseg as psg
from pypinyin import lazy_pinyin, Style

from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
from text.zh_normalization.text_normlization import TextNormalizer

# Convert Arabic numerals in a string to Chinese numerals.
normalizer = lambda x: cn2an.transform(x, "an2cn")

current_file_path = os.path.dirname(__file__)

# opencpop-strict.txt maps each pinyin syllable to its "initial final" symbol pair.
with open(os.path.join(current_file_path, "opencpop-strict.txt"), encoding="utf-8") as f:
    pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in f}
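
# Map full-width Chinese punctuation (plus a few other characters) onto the
# punctuation set defined in text.symbols.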
rep_map = {
    "：": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
}

tone_modifier = ToneSandhi()
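
# replace_punctuation: rewrite 嗯/呣 to 恩/母 so pypinyin yields standard
# syllables, normalize punctuation through rep_map, then drop every character
# that is neither a CJK ideograph (\u4e00-\u9fa5) nor allowed punctuation.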
def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
    )
    return replaced_text
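
# g2p: split normalized text into sentences after each punctuation mark, then
# convert every sentence to phoneme symbols plus word2ph (phones per character).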
def g2p(text):
    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
    phones, word2ph = _g2p(sentences)
    return phones, word2ph
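
# _get_initials_finals: run pypinyin twice over a word to collect, per character,
# the initial and the tone-numbered final (neutral tone rendered as "5").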
def _get_initials_finals(word):
    initials = []
    finals = []
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    for c, v in zip(orig_initials, orig_finals):
        initials.append(c)
        finals.append(v)
    return initials, finals
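
# _g2p: for each sentence, segment with jieba posseg, apply tone sandhi, then
# map every (initial, final) pair to the symbol pair from opencpop-strict.txt,
# appending the tone digit to the final.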
def _g2p(segments):
    phones_list = []
    word2ph = []
    for seg in segments:
        # Strip all ASCII letters from the sentence.
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)
            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])
        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post-process pypinyin output; i, ii and iii are
            # distinguished via the opencpop-strict table.
            if c == v:
                # Punctuation: initial and final are the same single symbol.
                assert c in punctuation
                phone = [c]
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]
                pinyin = c + v_without_tone
                assert tone in "12345"
                if c:
                    # Syllable with an initial: canonicalize contracted finals.
                    v_rep_map = {
                        "uei": "ui",
                        "iou": "iu",
                        "uen": "un",
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # Bare final with no initial: rewrite to its standalone spelling.
                    pinyin_rep_map = {
                        "ing": "ying",
                        "i": "yi",
                        "in": "yin",
                        "u": "wu",
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            "v": "yu",
                            "e": "e",
                            "i": "y",
                            "u": "w",
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
                new_v = new_v + tone
                phone = [new_c, new_v]
                word2ph.append(len(phone))
            phones_list += phone
    return phones_list, word2ph
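
# text_normalize: expand numbers, dates, units, etc. with PaddleSpeech's
# TextNormalizer, then run each sentence through replace_punctuation.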
def text_normalize(text):
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation(sentence)
    return dest_text

if __name__ == "__main__":
    # Scratch test inputs; only the last assignment is used.
    text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
    text = "呣呣呣~就是…大人的鼹鼠党吧?"
    text = "你好"
    text = text_normalize(text)
    print(g2p(text))
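    # For "你好" the call above should print roughly
    # (['n', 'i2', 'h', 'ao3'], [2, 2]): two phones per character plus word2ph,
    # with third-tone sandhi raising the first tone; the exact symbols depend
    # on opencpop-strict.txt.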

    # # Example usage
    # text = "这是一个示例文本:,你好!这是一个测试..."
    # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试