天天看點

Java 漢字轉拼音(完美支援解決多音字):工程結構、多音字詞典

        上一篇文章 Java 漢字轉拼音 介紹了 Java 中利用 Pinyin4j 實作漢字轉拼音,但是對於多音字問題採取的是組合拼音方式,例如 長沙 取拼音的結果就是 changsha zhangsha。某些情況下我們希望能得到多音字的唯一拼音,此時就需要借助多音字字典了,原理很簡單:給多音字一個預設的拼音,並告訴計算機碰到哪些詞的時候使用其它的拼音。例如 長 字,我們可以給它指定預設拼音為 zhang,並標記 長沙 的拼音為 chang。

工程結構

Java 漢字轉拼音(完美支援解決多音字)工程結構多音字詞典

多音字詞典

本類庫 支援自定義擴充詞典,詞典檔案名稱為py4j.dic,完整路徑為:resources/py4j/dictionary/py4j.dic,詞典檔案格式如下:

a#阿
ao#拗口/違拗/拗斷/執拗/拗口/拗口風/拗密碼/拗曲/拗性/拗折/警拗
ai#艾
bang#膀/磅/蚌
ba#扒
bai#叔伯/百/柏楊/㧳/梵呗/呗佛/呗音/呗唱/呗偈/呗聲/呗贊/贊呗
bao#剝皮/薄/暴/堡/曝
bei#呗
beng#蚌埠
bi#複辟/臂/秘魯/泌陽
bing#屏息/屏棄/屏氣/屏除/屏聲
bian#扁/便/便宜坊
bo#薄荷/單薄/伯/泊/波/柏/蘿蔔/孛
bu#蔔/柨
can#參
cang#藏/欌
cen#參差
ceng#曾/噌
cha#差/刹那/寶刹/一刹/查/碴/喳喳/喀喳
chai#公差/差役/專差/官差/聽差/美差/辦差/差事/差使/肥差/當差/欽差/苦差/出差
chan#顫/單于/禅
chang#長/廠
chao#朝/嘲/焯
che#工尺/車
chen#稱職/勻稱/稱心/相稱/對稱
cheng#稱/乘/澄/噌吰/橙/秤/盛滿/盛器/盛飯
chu#畜
chui#椎心
chuai#揣
chuan#傳
chi#匙/尺/吃
chong#重慶/重重/蟲
chou#臭/帱
chuang#經幢
chuo#綽
ci#參差/鱗差/伺候/龜茲
cuan#攢聚/攢動/攢集/攢宮/攢所
cuo#撮兒/撮要/撮合
da#大/嗒
dao#叨/帱載/帱察
dai#大夫
dan#單/彈/撣/澹
dang#铛
de#的/得
di#堤/底/怎的/有的/目的/标的/打的/的确/有的放/的盧/矢之的/言中的/語中的/的士/地/提防/快的/美的
diao#藍調/調調/音調/論調/格調/調令/低調/筆調/基調/強調/聲調/濫調/老調/色調/單調/腔調/跑調/曲調/步調/語調/主調/情調
ding#丁
du#讀/都/度
dou#全都/句讀
duo#舵/測度/忖度/揣度/猜度
dun#糧囤/盾/頓/沌/敦
e#阿谀/阿膠/阿彌/惡/擜
er#兒
fan#番
feng#馮
fei#婔
fo#佛
fu#仿佛/果脯/罘/莩
fou#否
fiao#覅
ga#咖喱/伽馬/嘎/戛納
gai#蓋
gao#告
gang#扛鼎
ge#革/蛤蚧/文蛤/蛤蜊/咯
gei#給
geng#脖頸
gong#女紅/共
gu#谷/中鹄/鼓
gui#龜/櫃/矽/倭傀/傀異/傀然/傀壘/傀怪/傀卓/傀奇/傀偉/傀民/傀俄/琦傀/奇傀
gua#呱
guan#綸巾/東莞
guang#廣
ha#蛤/哈/蝦蟆
hai#還/嗨/咳聲/咳笑
hao#貉子/貉絨
hang#夯/總行/分行/支行/行業/排行/行情/央行/商行/外行/銀行/中行/交行/招行/農行/工行/建行/商行/酒行/麻行/琴行/行業/同行/行列/行貨/行會/行家/巷道/引吭/扼吭/批吭/搤吭/高吭/喉吭/咔吭/絶吭/吭嗌/吭咽/吭首
he#和/合/核/鶴/猲
heng#道行/涥
hu#鹄/水浒/嗀/唬
hua#滑/呚/椛
huan#歸還/放還/奉還/圜
hui#會/浍河/媈/灳/哕/瑗珲
hong#紅/虹
huo#軟和/熱和/暖和
hun#尡/珲
ji#病革/給養/自給/給水/薪給/給予/供給/稽/緝/藉/奇數/亟/诘屈/荠菜/愱
jia#雪茄/伽/家/價/賈/戛
jian#見/淺淺
jiang#降
jiao#嚼舌/嚼字/嚼蠟/角/剿/餃/腳/蕉/矯/睡覺/僥/校對/校驗/校正/校準/審校/校場/校核/校勘/校訂/校閱/校樣
jie#解/慰藉/蘊藉/诘/媘/煯
jin#矜/勁/禁
jing#頸/景/強勁/勁風/勁旅/勁敵/勁射/蒼勁/遒勁/勁草
jiong#炅
ju#咀/居/桔/句/婮
jun#均
juan#棚圈/圈養/豬圈/羊圈
jue#主角/角色/旦角/女角/醜角/角力/名角/配角/嚼/覺/䏐
jun#龜裂/俊
ka#咖/卡/喀
kai#楷
kang#扛
ke#咳/殼
keng#吭
kuai#會計/财會/浍
kui#傀
kuo#括
la#癞痢/臘/蠟
lai#癞瘡/癞子/癞蛤/癞皮
lao#積潦/絡子/落枕/落價/粩/姥
le#樂/勒/了
lei#勒緊
lo#然咯
lou#佝偻/洩露/露面/露臉/露骨/露底/露餡/露一手/露相/露馬腳/露怯
long#裡弄/弄堂/泷
li#跞/礼/櫔/栃
liao#了解/了結/明了/了得/末了/未了/了如/潦/撩
liang#靓/倆
lie#挘
lin#崊
ling#霗/令
liu#六/遛
lu#碌/陸/露
luo#絡/落/漯/囖/洜/泺
lv#率/綠
lve#鋢/稤
lun#綸
ma#嫲/抹布/抹臉/抹桌子/摩挲
mai#埋
man#埋怨/蔓
mai#脈
mang#氓/芒
mao#冒
me#嚒
men#椚
meng#群氓/盟/癦
mei#沒/旀
mo#淹沒/沒收/出沒/沉沒/沒落/吞沒/覆沒/沒入/埋沒/鬼沒/隐沒/湮沒/辱沒/脈脈/模/摩/抹
mou#綢缪/牟
mi#秘/泌尿/分泌/謎/檷枸
mian#渑
ming#掵
miu#謬/謬論/纰缪
mu#大模/字模/模闆/模樣/模具/裝模/模子/牟尼/子牟/夷牟/懸牟/相牟/頭牟/賓牟/曹牟/岑牟/兜牟/盧牟/彌牟/牟食/牟槊/牟衫/牟光/牟牟/牟甲
na#哪/娜/那
nao#臑
nan#南
ne#哪吒/呢
nei#氞
neus#莻
nong#弄/燶
ni#毛呢/花呢/呢絨/線呢/呢料/呢子/呢喃/溺/檷
niao#尿/鳥/便溺
nian#粘膜/粘度/粘土/粘合劑/粘液/粘稠/粘合/粘着/粘結/粘性/粘附/不粘鍋/粘糊/粘蟲/粘聚/粘滞/焾/哖
niang#釀
nin#脌
ning#倿/擰
niu#拗/汼
nu#努
nuo#婀娜/袅娜/喏
nv#女
nve#瘧/硸
o#喔/筽
ou#膒
pa#扒手/扒竊/扒外/扒分/扒糕/扒灰/扒犁/扒龍/扒摟/扒山虎/扒艇
pai#派/迫擊/迫擊炮
pao#刨/炮/萢
pan#番禺
pang#胖/膀/磅
pei#蓜
pi#辟/否極/臧否/龍陂/芘
pian#扁舟/便宜/魸
piao#樸姓/餓莩/饑莩/葭莩
pin#穦
ping#屏/蘋/馮河
po#湖泊/血泊 /迫/樸刀/坡/陂
pu#一曝十寒/裡堡/十裡堡/脯/樸/曝曬/瀑/埔
qi#期/其/泣/祇
qiu#龜茲/湭
qi#稽首/緝鞋/栖/奇/漆/齊
qia#卡脖/卡子/關卡/卡殼/哨卡/邊卡/發夾/峠
qiao#雀盲/雀子/地殼/甲殼/軀殼
qian#纖/乾/淺
qiang#強/㛨/㩖/䅚/䵁
qie#茄/趔趄/聺/籡
qin#親/沁
qing#幹親/親家
qiong#熍
qu#區/趣/爠
quan#圈/券
que#雀/炔
re#聲喏/唱喏
rong#嬫
ruo#若/嵶
saeng#栍
sang#槡
sai#塞/嘥
sao#螦
se#堵塞/搪塞/茅塞/閉塞/鼻塞/梗塞/阻塞/淤塞/擁塞/哽塞/色
sha#莎/刹車/急刹/廈/杉木/杉篙
shai#色子
shao#勺/紅苕
shan#姓單/單縣/杉/敾/禅讓/受禅/禅變/禅代/禅诰
shang#衣裳
she#拾級/折本/射/蛇
shen#沙參/野參/參王/人參/紅參/丹參/山參/海參/鹿參/什麼/身/沈/桑椹/食椹/爛椹/木椹
sheng#野乘/千乘/史乘/省/晟/盛/陹/渑水
shi#鑰匙/什/識/似的/食/石/氏/拾/适/瑡
shiwa#瓧
shuai#表率/率性/率直/率真/粗率/率領/輕率/直率/草率/大率/坦率/衰
shuang#泷水/鏯
shu#屬/數/術/熟
shui#遊說
shuo#數見/說
si#伺/似/思
sou#蓃/摗
su#宿/鯂
sui#尿泡
ta#拓片/拓印/拓本/拓墨/拓寫/拓手/拓工/碑拓/疲沓/拖沓/雜沓/沓/塔/鴻塔
tang#湯/镗
tao#陶
tan#反彈/彈性/彈簧/彈力/彈奏/彈跳/彈指/彈劾/彈唱/彈射/彈性體/吹彈/評彈/亂彈琴/彈壓/彈指/彈簧/彈冠/彈雀/彈雀/彈絲/彈丸/澹台
te#脦
teng#虅
ti#提/體
tiao#調/苕
ting#町/聽
tong#通
tu#迌
tuan#湪
tui#褪
tuo#拓/袥
tun#囤/屯
wei#尾/蔚/圩堤/圩垸/圩田/圩子/趕圩/歌圩
weng#攚
wu#無/可惡/交惡/好惡/厭惡/憎惡/嫌惡/痛惡/深惡/兀
wan#藤蔓/枝蔓/根蔓/蔓草/瓜蔓/蔓兒/莞/萬/百萬/皖
wang#亡
wai#崴
xia#蝦/吓/夏/廈門/廈大/唬殺
xi#栖/系/蹊/洗/溪/戲/焁/銑/褶衣/褶褲
xiao#校/切削/削面/刀削/刮削
xian#纖細/光纖/纖巧/纖柔/纖小/纖維/纖瘦/纖纖/化纖/纖秀/棉纖/纖塵/銑鐵/金銑
xiang#投降/巷
xie#解數/出血/采血/換血/血糊/尿血/淤血/放血/血暈/血淋/便血/吐血/咯血/葉韻/蠍/蠍子/邪/猲猲
xin#嬜/邤
xiu#銅臭/乳臭/成宿/星宿/璓
xin#馨/信/鴻信
xing#深省/省視/内省/不省人事/省悟/省察/行/荥
xiong#匂
xu#牧畜/畜産/畜牧/畜養/并畜/畜銳/籲/圩/浒
xuan#箮
xue#削/血/樰
xun#荨/尋
ya#琊
yao#鑰/耀/曜/佋僥/僥觎/僥僺/僥利/僥傒/僥觊/僥會/僥濫/僥望/僥求/僥競/僥薄/僥躐/僥取/僥奇/僥忝/僥速/僥冀/僥冒/瘧子
yan#咽/殷紅/朱殷/腌/煙/曕
ye#液/抽咽/哽咽/咽炎/嗚咽/幽咽/悲咽/葉/葉/璍/潱/拽步/拽扶/拽紮
yi#自艾/遺/屹/嬄/噫
yin#殷/栶
ying#荥經/緓/灜
yo#杭育
yong#湧/硧
you#牗
yu#餘/呼籲/籲請/籲求/育/熨帖/熨燙/於
yuan#員/茒/圜丘
yun#熨
yue#約/樂音/器樂/樂律/樂章/音樂/樂理/國樂/樂隊/聲樂/奏樂/弦樂/樂壇/管樂/配樂/樂曲/樂譜/鎖鑰/密鑰/樂團/樂器/嬳/咽哕/唾哕/發哕/幹哕/哕吐/哕飯/哕嘔/哕息/哕厥/哕噫/哕逆/哕咽/哕罵/哕心/哕喈/口哕/嘔哕
za#綁紮/結紮/包紮/捆紮/咱家
zan#攢/咱
zang#寶藏/藏曆/藏文/藏語/藏青/藏族/藏醫/藏藥/藏藍/西藏
zai#牛仔/龜仔/龍仔/鼻仔/羊仔/仔仔/麻仔/麵包仔/麥旺仔/鴻仔/煲仔/福仔/畠
zao#栆
ze#擇
zeng#曾國藩/曾孫/曾祖父/曾祖/曾祖母/曾孫女/曾鞏/囎/缯
zong#綜/繌
zha#紮/柞狹/柞薪/柞子/柞鄂/柞葉/柞撒/槱柞/一柞/五柞宮/五柞/雠柞/芟柞/蠟祭/喳
zhai#宅/夈/擇席/擇菜
zhan#粘
zhang#列車長/行長/村長/鎮長/鄉長/區長/縣長/市長/省長/會長/班長/排長/連長/營長/團長/旅長/師長/軍長/委員長/局長/廳長/所長/部長/組長/生長/長大/長高/長個/
zhao#朝朝/明朝/朝晖/朝夕/朝思/今朝/朝氣/朝三/朝秦/朝霞/鷹爪/龍爪/魔爪/爪牙/着急/着迷/着火/怎麼着/正着/着涼/一着/犯不着/着數/這麼着/犯得着/着慌/着忙/數得着/龍爪槐/嘲哳/嘲惹
zhe#折/着/褶
zhen#殝/椹
zhi#標識/吱/殖/枝/方祇/後祇/皇祇/黃祇/皇地祇/金祇/祇樹/月氏
zhong#重/種
zhou#粥
zhu#屬意/著/駯
zhua#爪子
zhuai#拽
zhuan#芈月傳/外傳/傳記/自傳/正傳/小傳/評傳/傳略/别傳
zhui#椎/隹
zhuo#執著/着裝/着落/着意/着力/附着/着筆/膠着/着實/衣着/着眼/着想/着重/穿着/執着/着墨/着實/沉着/着陸/着想/着色/焯見/焯爍/輝焯
zhuang#幢房/一幢/幢樓/庒
zi#仔/茲
zu#足
zuo#柞/穝
           

關鍵代碼

Py4j.java      
package com.bytebeats.py4j;

import com.bytebeats.py4j.exception.BadHanYuPinYinException;
import com.bytebeats.py4j.util.StringUtils;
import com.google.common.collect.ArrayListMultimap;
import java.util.Locale;
import java.util.regex.Pattern;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Converts Chinese text to pinyin, using the polyphone dictionary held by
 * {@link Py4jDictionary} to pick a single reading for characters that pinyin4j
 * reports with several readings.
 *
 * <p>Output is toneless, lowercase pinyin with the first letter of every
 * syllable capitalised and no separator between syllables.
 */
public class Py4j {
	/** Runs of punctuation and spaces; each run is collapsed to a single space before conversion. */
	private static final Pattern PUNCTUATION =
			Pattern.compile("[\\.,\\,!·\\!?\\?;\\;\\(\\)()\\[\\]\\:: ]+");

	/**
	 * Context windows tried around a polyphonic character, as
	 * {charsBefore, charsAfter}: e.g. 銀[行], [長]沙, 龍[爪]槐, 芈月[傳], [長]孫無忌.
	 */
	private static final int[][] CONTEXT_WINDOWS = {
		{1, 0}, {0, 1}, {1, 1}, {2, 0}, {0, 2}
	};

	/** pinyin -> words/characters that must be read with that pinyin. */
	private final ArrayListMultimap<String,String> duoYinZiMap;

	/** Initialises the shared dictionary (a no-op once loaded) and keeps a reference to its map. */
	public Py4j(){
		Py4jDictionary.getDefault().init();
		duoYinZiMap = Py4jDictionary.getDefault().getDuoYinZiMap();
	}

	/**
	 * Returns the candidate pinyin readings of a single character.
	 *
	 * @param ch the character to convert
	 * @return a one-element array holding {@code ch} itself for printable ASCII
	 *         (32..126); otherwise whatever pinyin4j returns for {@code ch},
	 *         which callers must be prepared to find {@code null} or empty
	 * @throws BadHanYuPinYinException if pinyin4j rejects the output format
	 */
	public String[] getPinyin(char ch) {
		// Printable ASCII (space through '~') passes through unchanged. Space must be
		// accepted because punctuation is normalised to spaces; '~' (126) was
		// previously excluded by an off-by-one upper bound.
		if(ch>=32 && ch<=126){
			return new String[]{String.valueOf(ch)};
		}
		try{
			HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
			outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
			outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
			outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
			return PinyinHelper.toHanyuPinyinStringArray(ch, outputFormat);
		} catch (BadHanyuPinyinOutputFormatCombination e) {
			throw new BadHanYuPinYinException(e);
		}
	}

	/**
	 * Converts a phrase to pinyin, resolving polyphonic characters through the
	 * dictionary.
	 *
	 * @param chinese the text to convert
	 * @return the concatenated, capitalised syllables, or {@code null} when
	 *         {@code StringUtils.isEmpty(chinese)} holds
	 * @throws BadHanYuPinYinException if some character has no pinyin reading
	 */
	public String getPinyin(String chinese) {
		if(StringUtils.isEmpty(chinese)){
			return null;
		}

		chinese = PUNCTUATION.matcher(chinese).replaceAll(" ").trim();

		StringBuilder pinyin = new StringBuilder(32);
		char[] chs = chinese.toCharArray();
		for(int i=0;i<chs.length;i++){
			String[] candidates = getPinyin(chs[i]);
			if(candidates==null || candidates.length<1){
				throw new BadHanYuPinYinException("pinyin array is empty, char:"+chs[i]+",chinese:"+chinese);
			}

			String chosen;
			boolean singleReading = candidates.length==1
					|| (candidates.length==2 && candidates[0].equals(candidates[1]));
			if(singleReading){
				chosen = candidates[0];
			}else{
				chosen = resolvePolyphone(chinese, i, candidates);
			}
			pinyin.append(convertInitialToUpperCase(chosen));
		}

		return pinyin.toString();
	}

	/**
	 * Picks one reading for the polyphonic character at {@code index}.
	 *
	 * <p>The first candidate whose dictionary entry lists a word surrounding the
	 * character wins. Failing that, the last candidate that lists the bare
	 * character (its default reading) is used, and failing that the first
	 * candidate.
	 */
	private String resolvePolyphone(String text, int index, String[] candidates) {
		String self = String.valueOf(text.charAt(index));
		String matched = null;
		String fallback = null;
		for (String py : candidates) {
			if (matchesContext(py, text, index)) {
				matched = py;
				break;
			}
			if (duoYinZiMap.containsEntry(py, self)) {	// default reading
				fallback = py;
			}
		}

		if (StringUtils.isNotEmpty(matched)) {
			return matched;
		}
		if (StringUtils.isNotEmpty(fallback)) {
			return fallback;
		}
		return candidates[0];
	}

	/** Returns whether any context window around {@code index} is listed in the dictionary under {@code py}. */
	private boolean matchesContext(String py, String text, int index) {
		for (int[] window : CONTEXT_WINDOWS) {
			int start = index - window[0];
			int end = index + window[1] + 1;
			if (start >= 0 && end <= text.length()
					&& duoYinZiMap.containsEntry(py, text.substring(start, end))) {
				return true;
			}
		}
		return false;
	}

	/** Upper-cases the first character of {@code str}; {@code null} or empty input yields {@code ""}. */
	private String convertInitialToUpperCase(String str) {
		if (str == null || str.length()==0) {
			return "";
		}
		// Locale.ROOT keeps the result independent of the JVM default locale
		// (e.g. "i" must become "I", not the dotted capital used in Turkish).
		return str.substring(0, 1).toUpperCase(Locale.ROOT)+str.substring(1);
	}
}

           
Py4jDictionary.java      
package com.bytebeats.py4j;

import com.bytebeats.py4j.util.IoUtils;
import com.bytebeats.py4j.util.StringUtils;
import com.google.common.collect.ArrayListMultimap;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Enumeration;

/**
 * Singleton holder of the polyphone dictionary: a multimap from a pinyin
 * syllable to the words (or single characters) that must be read with it.
 *
 * <p>The dictionary is read from every classpath resource named
 * {@code py4j/dictionary/py4j.dic}. Each line has the form
 * {@code pinyin#word1/word2/...}; lines that do not match are skipped.
 *
 * @author Ricky Fung
 * @date 2017-02-16 20:16
 */
public class Py4jDictionary {

    private ArrayListMultimap<String,String> duoYinZiMap;

    private static final String PREFIX = "py4j/dictionary/";

    private static final String CONFIG_NAME = "py4j.dic";

    private static final String PINYIN_SEPARATOR = "#";

    private static final String WORD_SEPARATOR = "/";

    private volatile boolean inited;

    private Py4jDictionary(){

    }

    /**
     * Loads the dictionary once; later calls return immediately.
     *
     * <p>Synchronized so that concurrent first callers do not each parse the
     * dictionary. A failure to locate the resources is printed and leaves the
     * dictionary empty rather than failing the caller.
     */
    public synchronized void init(){
        if(inited){
            return;
        }
        System.out.println("******start load py4j config******");
        Enumeration<URL> configs = null;
        try{
            String fullName = PREFIX + CONFIG_NAME;
            ClassLoader cl = Thread.currentThread().getContextClassLoader();
            if(cl == null){
                // No context class loader on this thread: fall back to the loader of this class.
                cl = Py4jDictionary.class.getClassLoader();
            }
            configs = cl.getResources(fullName);
        } catch (Exception e){
            e.printStackTrace();
        }

        this.duoYinZiMap = parse(configs);
        inited = true;

        System.out.println("******load py4j config over******");
        System.out.println("py4j map key size:"+duoYinZiMap.keySet().size());
    }

    /** Parses every dictionary resource into one multimap; a {@code null} enumeration yields an empty map. */
    private ArrayListMultimap<String,String> parse(Enumeration<URL> configs){
        ArrayListMultimap<String,String> duoYinZiMap = ArrayListMultimap.create(512, 16);
        if(configs!=null){
            while (configs.hasMoreElements()) {
                parseURL(configs.nextElement(), duoYinZiMap);
            }
        }
        return duoYinZiMap;
    }

    /**
     * Reads one UTF-8 dictionary resource and adds its entries to {@code duoYinZiMap}.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the resource cannot be read
     */
    private void parseURL(URL url, ArrayListMultimap<String, String> duoYinZiMap){
        System.out.println("parse py4j dictionary:"+url.getPath());
        InputStream in = null;
        BufferedReader br = null;
        try {
            in = url.openStream();
            br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line = null;
            while ((line = br.readLine()) != null) {

                String[] arr = line.split(PINYIN_SEPARATOR);
                if (arr.length < 2) {
                    // Blank line or line without "pinyin#words": nothing to record.
                    continue;
                }

                String pinyin = arr[0].trim();
                if (StringUtils.isEmpty(pinyin) || StringUtils.isEmpty(arr[1])) {
                    continue;
                }

                for (String dyz : arr[1].split(WORD_SEPARATOR)) {
                    // Trim before the emptiness test so whitespace-only entries are dropped.
                    String word = dyz.trim();
                    if (StringUtils.isNotEmpty(word)) {
                        duoYinZiMap.put(pinyin, word);
                    }
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(String.format("load py4j config:%s error", url), e);
        } finally {
            IoUtils.closeQuietly(br);
            IoUtils.closeQuietly(in);
        }
    }

    /** Returns the loaded multimap, or {@code null} if {@link #init()} has not run yet. */
    ArrayListMultimap<String,String> getDuoYinZiMap(){
        return duoYinZiMap;
    }

    /** Returns the single shared instance. */
    public static Py4jDictionary getDefault(){
        return SingletonHolder.INSTANCE;
    }

    /** Holder class whose static field carries the shared instance. */
    private static class SingletonHolder {
        private static final Py4jDictionary INSTANCE = new Py4jDictionary();
    }
}
           

測試用例

package com.bytebeats.py4j;

import org.junit.*;

import java.util.Arrays;

/**
 * Smoke tests for {@link Py4j}: each test prints the conversion results to
 * standard output and makes no assertions.
 */
public class Py4jTest {
	private Py4j py4j;

	/** Creates a fresh converter before every test. */
	@Before
	public void init(){
		this.py4j = new Py4j();
	}

	/** Prints "phrase TAB pinyin" for a set of phrases containing polyphonic characters. */
	@Test
	public void testChinesePy() {
		final String[] phrases = {
			"肯德基", "重慶銀行", "長沙銀行", "便宜坊", "西藏", "藏寶圖", "出差", "參加", "列車長"
		};

		for (int idx = 0; idx < phrases.length; idx++) {
			String phrase = phrases[idx];
			System.out.println(phrase + "\t" + py4j.getPinyin(phrase));
		}
	}

	/** Prints "char TAB [candidate readings]" for single characters, including ASCII ones. */
	@Test
	public void testCharPy(){
		final char[] samples = {'長', '行', '藏', '度', '阿', '佛', '2', 'A', 'a'};

		for (int idx = 0; idx < samples.length; idx++) {
			char sample = samples[idx];
			String[] readings = py4j.getPinyin(sample);
			System.out.println(sample + "\t" + Arrays.toString(readings));
		}
	}

	/** Drops the converter reference after every test. */
	@After
	public void destroy(){
		this.py4j = null;
	}
}
           

源代碼下載

py4j:https://github.com/TiFG/py4j