# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re

import pyopenjtalk

from text import symbols

# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
    r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)

# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]

# List of (consonant, sokuon) pairs:
_real_sokuon = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    ]
]

# List of (consonant, hatsuon) pairs:
_real_hatsuon = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    ]
]
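
# The sokuon/hatsuon rule lists above are compiled but never applied anywhere
# in this module. Below is a minimal sketch of how they could be wired in;
# the helper name apply_real_phoneme_rules is an assumption, not part of the
# original code:
def apply_real_phoneme_rules(text):
    """Rewrite the placeholder sokuon (Q) and hatsuon (N) symbols into
    concrete consonants based on the following phoneme."""
    for regex, replacement in _real_sokuon:
        text = re.sub(regex, replacement, text)
    for regex, replacement in _real_hatsuon:
        text = re.sub(regex, replacement, text)
    return text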

def post_replace_ph(ph):
    # Map full-width punctuation to the ASCII forms used in the symbol set.
    rep_map = {
        "：": ",",
        "；": ",",
        "，": ",",
        "。": ".",
        "！": "!",
        "？": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
    }
    ph = rep_map.get(ph, ph)
    # Anything outside the known symbol set becomes UNK.
    if ph not in symbols:
        ph = "UNK"
    return ph
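
# For example, assuming "." is in symbols while "xyz" is not (both inputs
# are hypothetical): post_replace_ph("。") -> "." and post_replace_ph("xyz") -> "UNK"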

def symbols_to_japanese(text):
    for regex, replacement in _symbols_to_japanese:
        text = re.sub(regex, replacement, text)
    return text
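
# e.g. symbols_to_japanese("50%") -> "50パーセント"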

def preprocess_jap(text, with_prosody=False):
    """Reference: https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
    text = symbols_to_japanese(text)
    sentences = re.split(_japanese_marks, text)
    marks = re.findall(_japanese_marks, text)
    text = []
    for i, sentence in enumerate(sentences):
        if re.match(_japanese_characters, sentence):
            if with_prosody:
                # Drop the leading "^" and trailing "$"/"?" sentence markers.
                text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
            else:
                p = pyopenjtalk.g2p(sentence)
                text += p.split(" ")
        if i < len(marks):
            if marks[i] == " ":  # avoid an unexpected UNK for bare spaces
                continue
            text += [marks[i].replace(" ", "")]
    return text
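
# Illustrative behaviour of the split above (a hand-worked example, not
# program output): for "こんにちは、世界!" the split yields sentences
# ["こんにちは", "世界", ""] and marks ["、", "!"], so punctuation is
# re-inserted between the phonemized chunks. Note that Latin letters match
# _japanese_characters, so English words are also passed to pyopenjtalk.g2p.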

def text_normalize(text):
    # TODO: Japanese text normalization.
    return text

# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): Whether to drop unvoiced vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    # Full-context labels; with recent pyopenjtalk this composition is
    # equivalent to pyopenjtalk.extract_fullcontext(text).
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)

    phones = []
    for n in range(N):
        lab_curr = labels[n]

        # current phoneme
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)

        # treat unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # check whether the sentence is a question
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)

        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    return phones

# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def _numeric_feature_by_regex(regex, s):
    match = re.search(regex, s)
    if match is None:
        # -50 is an out-of-range sentinel, so comparisons against real
        # feature values never match accidentally.
        return -50
    return int(match.group(1))
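
# Illustrative example; the label fragment below is hand-made, not real
# pyopenjtalk output:
#   _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", ".../A:-3+1+4/B:...")  # -> -3
#   _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", "no A feature")        # -> -50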

def g2p(norm_text, with_prosody=False):
    phones = preprocess_jap(norm_text, with_prosody)
    phones = [post_replace_ph(i) for i in phones]
    # TODO: implement tones and word2ph
    return phones

if __name__ == "__main__":
    phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
    print(phones)
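    # With prosody enabled, the output also carries accent markers such as
    # "#", "[" and "]" (the sentence-level "^"/"$" markers are stripped in
    # preprocess_jap). A usage sketch; whether these marks survive
    # post_replace_ph depends on the project's symbol set:
    phones_prosody = g2p("こんにちは、世界。", with_prosody=True)
    print(phones_prosody)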