diff --git a/chpronounce/chp.py b/chpronounce/chp.py index 753fbaa80..4ecb7d18c 100644 --- a/chpronounce/chp.py +++ b/chpronounce/chp.py @@ -24,6 +24,7 @@ def _append_phrase(self, ph, res): def _append_word(self, word, pos, res): if word in self.dic[1]: for dy, poses in self.dic[1][word]: + if pos in poses: res.append(dy) break @@ -55,7 +56,7 @@ def get_duyin(self, sentence): if len(word) == 1: self._append_word(word, pos, res) else: - if word in self.dic[len(word)]: + if len(word) in self.dic and word in self.dic[len(word)]: self._append_phrase(word, res) else: sub = "" @@ -64,13 +65,14 @@ def get_duyin(self, sentence): self._break_down(sub, pos, res) sub = "" sub += c - if len(sub) > 1 and sub in self.dic[len(sub)]: + if len(word) in self.dic and len(sub) > 1 and sub in self.dic[len(sub)]: self._append_phrase(sub, res) sub = "" self._break_down(sub, pos, res) + if any(c in sentence for c in "不一"): for i in range(len(sentence)): - if any(sentence[i] == c for c in "不一") and i + 1 != len(sentence): + if sentence[i] in "不一" and i + 1 != len(sentence): if res[i + 1][2] == 4: py, zy, tone = res[i] tone = 2 diff --git a/chpronounce/postag b/chpronounce/postag index e7d864d2c..badf16d74 100644 --- a/chpronounce/postag +++ b/chpronounce/postag @@ -2,7 +2,7 @@ n 名词 名 t 时间词 副 s 处所词 副 f 方位词 副 -m 数词 X +m 数词 名 q 量词 量 b 区别词 动 r 代词 名 @@ -22,12 +22,12 @@ j 简称 缀 h 前接成分 缀 k 后接成分 缀 g 语素 名 -x 非语素字 X -w 标点符号 X +x 非语素字 名 +w 标点符号 名 nr 人名 名 ns 地名 名 nt 机构名称 名 -nx 外文字符 X +nx 外文字符 名 nz 其它专名 名 vd 副动词 动 vn 名动词 动 diff --git a/chpronounce/xdic.pkl b/chpronounce/xdic.pkl index 86467fa19..057bcc86e 100644 Binary files a/chpronounce/xdic.pkl and b/chpronounce/xdic.pkl differ diff --git a/verify_dict.py b/verify_dict.py index 147a03dcc..be55de36a 100644 --- a/verify_dict.py +++ b/verify_dict.py @@ -36,7 +36,7 @@ def main(): ignore_flag = True break if ignore_flag: - continue + to_pop.append((word_len, word, 0)) if not isinstance(lis, list): lis = [lis] @@ -80,7 +80,7 @@ def main(): print("t |", "\t ".join(map(str, tones))) print("+++++++++++++++-----------------------------") - if len(cys) != word_len or len(tones) != word_len: + if len(cys) != word_len or len(tones) != word_len or len(pys) != word_len: to_pop.append((word_len, word, iol)) logging.warning(f"{word}[{iol}] is invalid.") continue @@ -90,17 +90,16 @@ def main(): x = [y for y in x if y in CHEWINGS] x = "".join(x) if xx != x: - logging.warning(f"[{word}] {xx} ---> {x}") cys[i] = x ori = dic[word_len][word] + # if word_len == 1: + # pys, cys, tones = pys[0], cys[0], tones[0] if word_len == 1: - pys, cys, tones = pys[0], cys[0], tones[0] - if isinstance(dic[word_len][word], list): - dic[word_len][word][iol] = ((pys, cys, tones), pos) if pos != '' else (pys, cys, tones) + dic[word_len][word][iol] = ((pys, cys, tones), pos) # if pos != '' else (pys, cys, tones) else: - dic[word_len][word] = ((pys, cys, tones), pos) if pos != '' else (pys, cys, tones) + dic[word_len][word] = (pys, cys, tones) # if pos != '' else (pys, cys, tones) if ori != dic[word_len][word]: n_diff += 1 @@ -110,7 +109,8 @@ def main(): # input() for word_len, word, iol in to_pop: - dic[word_len].pop(word) + if word in dic[word_len]: + dic[word_len].pop(word) # summary total = 0