tone_sandhi.py 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806
  1. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import List
  15. from typing import Tuple
  16. import jieba_fast as jieba
  17. from pypinyin import lazy_pinyin
  18. from pypinyin import Style
class ToneSandhi:
    """Mandarin tone-sandhi rewriting for pypinyin-style finals.

    Holds the word lists consulted by the sandhi rules:

    * ``must_neural_tone_words`` — words whose last syllable is read with
      the neutral tone (tone "5"), e.g. 麻烦, 豆腐.
    * ``must_not_neural_tone_words`` — words that look like neutral-tone
      candidates (reduplications, "...子" suffixes, etc.) but must keep
      their full tones, e.g. 分子, 哈哈.
    * ``punc`` — punctuation characters (full-width and ASCII) used by the
      "一" sandhi rule.
    """

    def __init__(self):
        # Final syllable of these words is read with the neutral tone (tone 5).
        self.must_neural_tone_words = {
            "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨",
            "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠",
            "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么",
            "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主",
            "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳",
            "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄",
            "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在",
            "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同",
            "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太",
            "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神",
            "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
            "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗",
            "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛",
            "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意",
            "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务",
            "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心",
            "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头",
            "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃",
            "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友",
            "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防",
            "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举",
            "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
            "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么",
            "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通",
            "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌",
            "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头",
            "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆",
            "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精",
            "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么",
            "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕",
            "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚",
            "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤",
            "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
            "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析",
            "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟",
            "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么",
            "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气",
            "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司",
            "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用",
            "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾",
            "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜",
            "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记",
        }
        # These words keep their full tones even though they match patterns
        # (reduplication, "...子" suffix, "...地" suffix) that the neutral-tone
        # rules would otherwise catch.
        self.must_not_neural_tone_words = {
            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人",
            "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈",
            "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳",
            "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青",
        }
        # Full-width and ASCII punctuation checked by the "一" sandhi rule.
        self.punc = ":,;。?!“”‘’':,;.?!"
    # the meaning of jieba pos tags: https://blog.csdn.net/weixin_44174352/article/details/113731041
    # e.g.
    # word: "家里"
    # pos: "s"
    # finals: ['ia1', 'i3']
    def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
        """Rewrite *finals* so neutral-tone (tone 5) syllables end in "5".

        Args:
            word: one segmented Chinese word.
            pos: its jieba part-of-speech tag.
            finals: pypinyin FINALS_TONE3-style finals, one per character;
                the last character of each final is the tone digit.

        Returns:
            The (mutated) finals list.
        """
        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺:
        # the repeated character is read with the neutral tone.
        for j, item in enumerate(word):
            if (
                j - 1 >= 0
                and item == word[j - 1]
                and pos[0] in {"n", "v", "a"}
                and word not in self.must_not_neural_tone_words
            ):
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("个")
        # sentence-final modal particles are neutral, e.g. 好吧, 行呢
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
            finals[-1] = finals[-1][:-1] + "5"
        # structural particles 的/地/得 are neutral
        elif len(word) >= 1 and word[-1] in "的地得":
            finals[-1] = finals[-1][:-1] + "5"
        # aspect particles, e.g. 走了, 看着, 去过
        elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
            finals[-1] = finals[-1][:-1] + "5"
        # suffixes 们/子 on pronouns and nouns, unless whitelisted
        elif (
            len(word) > 1
            and word[-1] in "们子"
            and pos in {"r", "n"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[-1] = finals[-1][:-1] + "5"
        # locative suffixes, e.g. 桌上, 地下, 家里
        elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
            finals[-1] = finals[-1][:-1] + "5"
        # directional complements, e.g. 上来, 下去
        elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
            finals[-1] = finals[-1][:-1] + "5"
        # "个" used as a measure word (after a numeral or quantifier)
        elif (
            ge_idx >= 1
            and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
        ) or word == "个":
            finals[ge_idx] = finals[ge_idx][:-1] + "5"
        else:
            # dictionary lookup: whole word or its last two characters
            if (
                word in self.must_neural_tone_words
                or word[-2:] in self.must_neural_tone_words
            ):
                finals[-1] = finals[-1][:-1] + "5"
        # also check each of the two jieba subwords against the dictionary
        word_list = self._split_word(word)
        finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
        for i, word in enumerate(word_list):
            # conventional neutral tone in Chinese
            if (
                word in self.must_neural_tone_words
                or word[-2:] in self.must_neural_tone_words
            ):
                finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
        finals = sum(finals_list, [])
        return finals
  545. def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
  546. # e.g. 看不懂
  547. if len(word) == 3 and word[1] == "不":
  548. finals[1] = finals[1][:-1] + "5"
  549. else:
  550. for i, char in enumerate(word):
  551. # "不" before tone4 should be bu2, e.g. 不怕
  552. if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
  553. finals[i] = finals[i][:-1] + "2"
  554. return finals
  555. def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
  556. # "一" in number sequences, e.g. 一零零, 二一零
  557. if word.find("一") != -1 and all(
  558. [item.isnumeric() for item in word if item != "一"]
  559. ):
  560. return finals
  561. # "一" between reduplication words shold be yi5, e.g. 看一看
  562. elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
  563. finals[1] = finals[1][:-1] + "5"
  564. # when "一" is ordinal word, it should be yi1
  565. elif word.startswith("第一"):
  566. finals[1] = finals[1][:-1] + "1"
  567. else:
  568. for i, char in enumerate(word):
  569. if char == "一" and i + 1 < len(word):
  570. # "一" before tone4 should be yi2, e.g. 一段
  571. if finals[i + 1][-1] == "4":
  572. finals[i] = finals[i][:-1] + "2"
  573. # "一" before non-tone4 should be yi4, e.g. 一天
  574. else:
  575. # "一" 后面如果是标点,还读一声
  576. if word[i + 1] not in self.punc:
  577. finals[i] = finals[i][:-1] + "4"
  578. return finals
  579. def _split_word(self, word: str) -> List[str]:
  580. word_list = jieba.cut_for_search(word)
  581. word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
  582. first_subword = word_list[0]
  583. first_begin_idx = word.find(first_subword)
  584. if first_begin_idx == 0:
  585. second_subword = word[len(first_subword) :]
  586. new_word_list = [first_subword, second_subword]
  587. else:
  588. second_subword = word[: -len(first_subword)]
  589. new_word_list = [second_subword, first_subword]
  590. return new_word_list
    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Third-tone sandhi: consecutive tone-3 syllables shift to tone 2.

        Args:
            word: one segmented Chinese word (only lengths 2, 3 and 4 are
                rewritten; other lengths pass through unchanged).
            finals: pypinyin-style finals whose last character is the tone
                digit.

        Returns:
            The rewritten finals list (may be a new list for 3- and
            4-character words).
        """
        if len(word) == 2 and self._all_tone_three(finals):
            # 3+3 -> 2+3
            finals[0] = finals[0][:-1] + "2"
        elif len(word) == 3:
            word_list = self._split_word(word)
            if self._all_tone_three(finals):
                # disyllabic + monosyllabic, e.g. 蒙古/包: first two shift
                if len(word_list[0]) == 2:
                    finals[0] = finals[0][:-1] + "2"
                    finals[1] = finals[1][:-1] + "2"
                # monosyllabic + disyllabic, e.g. 纸/老虎: middle one shifts
                elif len(word_list[0]) == 1:
                    finals[1] = finals[1][:-1] + "2"
            else:
                finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
                if len(finals_list) == 2:
                    for i, sub in enumerate(finals_list):
                        # a fully tone-3 disyllabic part, e.g. 所有/人
                        if self._all_tone_three(sub) and len(sub) == 2:
                            finals_list[i][0] = finals_list[i][0][:-1] + "2"
                        # tone-3 boundary between the parts, e.g. 好/喜欢
                        elif (
                            i == 1
                            and not self._all_tone_three(sub)
                            and finals_list[i][0][-1] == "3"
                            and finals_list[0][-1][-1] == "3"
                        ):
                            finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
                    finals = sum(finals_list, [])
        # split an idiom into two halves of length 2 and treat each half
        elif len(word) == 4:
            finals_list = [finals[:2], finals[2:]]
            finals = []
            for sub in finals_list:
                if self._all_tone_three(sub):
                    sub[0] = sub[0][:-1] + "2"
                finals += sub
        return finals
  629. def _all_tone_three(self, finals: List[str]) -> bool:
  630. return all(x[-1] == "3" for x in finals)
  631. # merge "不" and the word behind it
  632. # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
  633. def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  634. new_seg = []
  635. last_word = ""
  636. for word, pos in seg:
  637. if last_word == "不":
  638. word = last_word + word
  639. if word != "不":
  640. new_seg.append((word, pos))
  641. last_word = word[:]
  642. if last_word == "不":
  643. new_seg.append((last_word, "d"))
  644. last_word = ""
  645. return new_seg
  646. # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
  647. # function 2: merge single "一" and the word behind it
  648. # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
  649. # e.g.
  650. # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
  651. # output seg: [['听一听', 'v']]
  652. def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  653. new_seg = []
  654. # function 1
  655. for i, (word, pos) in enumerate(seg):
  656. if (
  657. i - 1 >= 0
  658. and word == "一"
  659. and i + 1 < len(seg)
  660. and seg[i - 1][0] == seg[i + 1][0]
  661. and seg[i - 1][1] == "v"
  662. and seg[i + 1][1] == "v"
  663. ):
  664. new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
  665. else:
  666. if (
  667. i - 2 >= 0
  668. and seg[i - 1][0] == "一"
  669. and seg[i - 2][0] == word
  670. and pos == "v"
  671. ):
  672. continue
  673. else:
  674. new_seg.append([word, pos])
  675. seg = new_seg
  676. new_seg = []
  677. # function 2
  678. for i, (word, pos) in enumerate(seg):
  679. if new_seg and new_seg[-1][0] == "一":
  680. new_seg[-1][0] = new_seg[-1][0] + word
  681. else:
  682. new_seg.append([word, pos])
  683. return new_seg
    # merge two adjacent words when BOTH consist entirely of third-tone syllables
    def _merge_continuous_three_tones(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        """Merge neighbouring all-tone-3 words so _three_sandhi sees them together.

        Args:
            seg: jieba segmentation as (word, pos) pairs.

        Returns:
            A new segmentation; entries are [word, pos] lists.
        """
        new_seg = []
        # per-word finals from pypinyin; last char of each final is the tone digit
        sub_finals_list = [
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for (word, pos) in seg
        ]
        assert len(sub_finals_list) == len(seg)
        # merge_last[i] marks that seg[i] was absorbed into the previous entry,
        # preventing chained merges of three or more words
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if (
                i - 1 >= 0
                and self._all_tone_three(sub_finals_list[i - 1])
                and self._all_tone_three(sub_finals_list[i])
                and not merge_last[i - 1]
            ):
                # if the previous word is a reduplication, don't merge:
                # reduplication must be handled by _neural_sandhi instead
                if (
                    not self._is_reduplication(seg[i - 1][0])
                    and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
                ):
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:
                    new_seg.append([word, pos])
            else:
                new_seg.append([word, pos])
        return new_seg
  714. def _is_reduplication(self, word: str) -> bool:
  715. return len(word) == 2 and word[0] == word[1]
    # merge two adjacent words when the LAST syllable of the first and the
    # FIRST syllable of the second are both tone 3
    def _merge_continuous_three_tones_2(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        """Merge words that meet at a tone-3/tone-3 boundary.

        Args:
            seg: jieba segmentation as (word, pos) pairs.

        Returns:
            A new segmentation; entries are [word, pos] lists.
        """
        new_seg = []
        # per-word finals from pypinyin; last char of each final is the tone digit
        sub_finals_list = [
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for (word, pos) in seg
        ]
        assert len(sub_finals_list) == len(seg)
        # merge_last[i] marks that seg[i] was absorbed into the previous entry
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if (
                i - 1 >= 0
                and sub_finals_list[i - 1][-1][-1] == "3"
                and sub_finals_list[i][0][-1] == "3"
                and not merge_last[i - 1]
            ):
                # if the previous word is a reduplication, don't merge:
                # reduplication must be handled by _neural_sandhi instead
                if (
                    not self._is_reduplication(seg[i - 1][0])
                    and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
                ):
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:
                    new_seg.append([word, pos])
            else:
                new_seg.append([word, pos])
        return new_seg
  746. def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  747. new_seg = []
  748. for i, (word, pos) in enumerate(seg):
  749. if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
  750. new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
  751. else:
  752. new_seg.append([word, pos])
  753. return new_seg
  754. def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  755. new_seg = []
  756. for i, (word, pos) in enumerate(seg):
  757. if new_seg and word == new_seg[-1][0]:
  758. new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
  759. else:
  760. new_seg.append([word, pos])
  761. return new_seg
  762. def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  763. seg = self._merge_bu(seg)
  764. try:
  765. seg = self._merge_yi(seg)
  766. except:
  767. print("_merge_yi failed")
  768. seg = self._merge_reduplication(seg)
  769. try:
  770. seg = self._merge_continuous_three_tones(seg)
  771. except:
  772. print("_merge_continuous_three_tones failed")
  773. try:
  774. seg = self._merge_continuous_three_tones_2(seg)
  775. except:
  776. print("_merge_continuous_three_tones_2 failed")
  777. seg = self._merge_er(seg)
  778. return seg
  779. def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
  780. finals = self._bu_sandhi(word, finals)
  781. finals = self._yi_sandhi(word, finals)
  782. finals = self._neural_sandhi(word, pos, finals)
  783. finals = self._three_sandhi(word, finals)
  784. return finals