1. 序
詞是句子組成的基本單元,不像英語句子已經分好詞了,中文處理的第一步就是中文分詞。
分詞中面臨的三大基本問題
- 分詞規範
- 分詞歧義
- 未登入詞的識别
中文分詞算法大概分為兩大類
第一類:基于字元串比對
即掃描字元串,如果發現字元串的子串和詞相同,就算比對。這類分詞通常會加入一些啟發式規則,比如“正向/反向最大比對”, “長詞優先” 等政策。
優點:速度快,都是O(n)時間複雜度,實作簡單,效果尚可
缺點:就是對歧義和未登入詞處理不好
案例:庖丁解牛分詞器就是基于字元串比對的分詞。
- 歧義的例子很簡單"長春市/長春/藥店"、 "長春/市長/春藥/店"
- 未登入詞即詞典中沒有出現的詞,當然也就處理不好
第二類:基于統計以及機器學習的分詞方式
這類分詞基于人工标注的詞性和統計特征,對中文進行模組化,即根據觀測到的資料(标注好的語料)對模型參數進行估計,即訓練。 在分詞階段再通過模型計算各種分詞出現的機率,将機率最大的分詞結果作為最終結果。常見的序列标注模型有HMM和CRF。
優點:很好處理歧義和未登入詞問題,效果比基于字元串比對效果好
缺點:需要大量的人工标注資料,較慢的分詞速度
案例:Stanford Word Segmenter
2. 基于字元串比對的中文分詞(以前向最大比對為例)
參考代碼
def WordSeg(Inputfile, Outputfile):
    """Read Inputfile line by line, segment each line with forward maximum
    matching (PreSenSeg), and write the space-separated tokens to
    Outputfile as utf-8."""
    fin = file(Inputfile)
    fout = file(Outputfile, 'w')
    for rawline in fin:
        text = rawline.strip().decode('utf-8')
        # Cut the line into chunks at stop characters; every stop character
        # is kept as a chunk of its own.
        chunks = []
        pending = ''
        for idx, ch in enumerate(text):
            if ch in StopWord:
                chunks.append(pending)
                chunks.append(ch)
                pending = ''
            else:
                pending += ch
                if idx == len(text) - 1:
                    chunks.append(pending)
        # Forward maximum matching on every non-delimiter chunk.
        tokens = []
        for chunk in chunks:
            if chunk in StopWord:
                tokens.append(chunk)
            else:
                tokens.extend(PreSenSeg(chunk, span))
        outline = ''.join(tok + ' ' for tok in tokens)
        fout.write(outline.encode('utf-8') + '\n')
    fin.close()
    fout.close()
def PreSenSeg(sen, span):
    # Forward maximum matching over one chunk `sen` (contains no stop chars).
    # Greedily takes the longest dictionary word of at most `span` characters
    # starting at `cur`; unknown single characters fall out as 1-char tokens.
    # Relies on module globals WordDic (word -> 1) and module `re`.
    post = span      # exclusive end of the current candidate sen[cur:post]
    if len(sen) < span:
        post = len(sen)
    cur = 0          # start of the next token
    revlist = []     # tokens, in left-to-right order
    while 1:
        if cur >= len(sen):
            break
        # Peel off a leading run of digits (ASCII, fullwidth, or Chinese
        # numerals incl. 十/百/千/萬/億/兆) as one token, e.g. u"1998年" -> u"1998".
        # NOTE(review): the '|' separators inside [...] are literal class
        # members (so '|' itself matches), and \uff2f is fullwidth 'O',
        # not a digit - confirm intent.
        s = re.search(u"^[0|1|2|3|4|5|6|7|8|9|\uff11|\uff12|\uff13|\uff14|\uff15|\uff16|\uff17|\uff18|\uff19|\uff10|\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u96f6|\u5341|\u767e|\u5343|\u4e07|\u4ebf|\u5146|\uff2f]+", sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur + len(s.group())
                post = cur + span   # restart the match window after the token
                if post > len(sen):
                    post = len(sen)
        # Likewise peel off a leading run of ASCII/fullwidth letters (an
        # English word). NOTE(review): fullwidth 'j' (\uff4a) is missing
        # (\uff47 appears twice) and fullwidth 'T' (\uff34) is missing.
        s = re.search(u"^[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|\uff41|\uff42|\uff43|\uff44|\uff45|\uff46|\uff47|\uff48|\uff49|\uff47|\uff4b|\uff4c|\uff4d|\uff4e|\uff4f|\uff50|\uff51|\uff52|\uff53|\uff54|\uff55|\uff56|\uff57|\uff58|\uff59|\uff5a|\uff21|\uff22|\uff23|\uff24|\uff25|\uff26|\uff27|\uff28|\uff29|\uff2a|\uff2b|\uff2c|\uff2d|\uff2e|\uff2f|\uff30|\uff31|\uff32|\uff33|\uff35|\uff36|\uff37|\uff38|\uff39|\uff3a]+", sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur + len(s.group())
                post = cur + span
                if post > len(sen):
                    post = len(sen)
        # Longest-match step: accept sen[cur:post] if it is a dictionary
        # word, or if it is already down to a single character; otherwise
        # shorten the candidate by one character from the right.
        if (WordDic.has_key(sen[cur:post])) or (cur + 1 == post):
            if sen[cur:post] != '':
                revlist.append(sen[cur:post])
            cur = post
            post = post + span
            if post > len(sen):
                post = len(sen)
        else:
            post -= 1
    return revlist
注意幾點
- 首先根據标點切開分成小句子,标點絕對是分割的最佳标志。
- 句子中的數字(阿拉伯、漢字、阿拉伯+漢字。注意有十白千萬億)自動檢測出來,不用再切割了。比如1998年=>1998 年
- 句子中的英文單詞直接識别出來,不用分割了。比如:Happy New Year。
後向最大比對與前向思路相同,隻不過切分方向是從後往前。
3. 利用N-gram進行中文分詞
語言模型是根據語言客觀事實而進行的語言抽象數學模組化,是一種對應關系。語言模型與語言客觀事實之間的關系,如同數學上的抽象直線與具體直線之間的關系。
N-gram
語言模型在自然語言處理中占有重要地位,尤其是在基于統計模型的NLP任務中得到了廣泛的應用,目前主要采用的是n元文法模型(N-gram model),這種模型建構簡單、直接,但同時也因為資料缺乏而必須采取平滑算法。
一個語言模型通常建構為字元串s的機率分布p(s),p(s)試圖反映字元串s作為一個句子出現時的頻率。例如一個人所說的100個句子中大約有一句是“OK”,那麼可以認為P(OK)=0.01。而對于句子“the apple eat an chicken”,可以認為其機率為0,因為幾乎沒有人這麼說。與語言學不同,語言模型與句子是否合乎文法沒有關系。對于字串假設有l個基元(基元可以是字、詞、短語等)組成句子,那麼s = w1w2...wl,其機率計算公式為:
p(s) = p(w1)p(w2|w1)p(w3|w1w2)...p(wl|w1w2...wl-1)
把第i個詞wi之前的詞w1w2....wi-1稱為wi的“曆史”。随着曆史長度的增加,不同的曆史數目成指數增長。如果曆史長度為i-1,那麼就有L^(i-1)種不同的曆史(L為詞彙集的大小),這樣必須在所有曆史的基礎上得出産生第i個詞的機率。這樣不可能從訓練資料中正确估計出p(wi|w1w2...wi-1),并且很多曆史不可能從訓練資料中出現。其中一種比較實際的做法基于這樣的假設:第n個詞的出現隻與前面n-1個詞相關,而與其它任何詞都不相關,整句的機率就是各個詞出現機率的乘積。這些機率可以通過直接從語料中統計N個詞同時出現的次數得到。常用的是二元的Bi-Gram(隻與前一個詞有關)和三元的Tri-Gram(隻與前兩個詞有關)。
以二元模型(Bi-Gram)為例
p(s) = p(w1|<BEG>)p(w2|w1)p(w3|w2)...p(wl|wl-1)p(<END>|wl)
其中p(wi|wi-1) = p(wi-1wi)/p(wi-1*)
前邊已經利用前向最大比對和後向最大比對對句子進行了中文分詞。為了提高分詞的準确度,可以利用N-gram比較前向、後向哪個分詞的得到的機率結果更大,就取相應的分詞結果。
4. 小試牛刀
1. 前向後向中文分詞
資料下載下傳:待分詞檔案+對應答案+詞典
代碼
#! -*- coding:utf-8 -*-
import sys
import os
import re
#StopWordtmp = ['。', ',', '!', '?', ':', '“', '”', '‘', '’', '(', ')', '【', '】', '{', '}', '-', '-', '~', '[', ']', '〔', '〕', '.', '@', '¥', '•', '.']
# Delimiter characters (whitespace + CJK/ASCII punctuation) that always
# split a line; each entry must be a single character because WordSeg tests
# characters one by one against this list.
# BUG FIX: the original listed u'\x30fb' - but '\x30' is '0', so that entry
# was the 3-char string u'0fb' and the katakana middle dot was never treated
# as a delimiter. The correct escape is u'\u30fb'.
StopWordtmp = [' ', u'\u3000', u'\u30fb', u'\u3002', u'\uff0c', u'\uff01', u'\uff1f', u'\uff1a', u'\u201c', u'\u201d', u'\u2018', u'\u2019', u'\uff08', u'\uff09', u'\u3010', u'\u3011', u'\uff5b', u'\uff5d', u'-', u'\uff0d', u'\uff5e', u'\uff3b', u'\uff3d', u'\u3014', u'\u3015', u'\uff0e', u'\uff20', u'\uffe5', u'\u2022', u'.']
WordDic = {}    # word -> 1, loaded by InitDic; O(1) membership test
StopWord = []   # populated from StopWordtmp by InitStopword
span = 16       # maximum word length (in chars) tried by the matchers
def InitStopword():
    """Copy the raw delimiter characters into the module-level StopWord list."""
    StopWord.extend(StopWordtmp)
def InitDic(Dicfile):
    # Load the word dictionary: one word per line, utf-8. Words are stored
    # as unicode keys with dummy value 1 so lookups are O(1) via has_key.
    f = file(Dicfile)  # Python 2 built-in file()
    for line in f:
        line = line.strip().decode('utf-8')
        WordDic[line] = 1;
    f.close()
    print len(WordDic)
    print "Dictionary has built down!"
def WordSeg(Inputfile, Outputfile, outputfile2):
    """Segment Inputfile line by line and write two results, both utf-8 with
    tokens separated by single spaces:
      Outputfile   - forward maximum matching (PreSenSeg)
      outputfile2  - backward maximum matching (PostSenSeg)
    Uses module globals StopWord and span."""
    f = file(Inputfile)
    w = file(Outputfile, 'w')
    # BUG FIX: the original opened file(Outputfile2, 'w'), which is not the
    # parameter (spelled `outputfile2`) and only worked by accidentally
    # picking up the global of that name set in __main__.
    w2 = file(outputfile2, 'w')
    for line in f:
        line = line.strip().decode('utf-8')
        senList = []
        newsenList = []
        tmpword = ''
        # Split the line into chunks at stop characters; the stop characters
        # are kept as chunks of their own.
        for i in range(len(line)):
            if line[i] in StopWord:
                senList.append(tmpword)
                senList.append(line[i])
                tmpword = ''
            else:
                tmpword += line[i]
                if i == len(line) - 1:
                    senList.append(tmpword)
        #Pre: forward maximum matching on every non-delimiter chunk
        for key in senList:
            if key in StopWord:
                newsenList.append(key)
            else:
                tmplist = PreSenSeg(key, span)
                for keyseg in tmplist:
                    newsenList.append(keyseg)
        Prewriteline = ''
        for key in newsenList:
            Prewriteline = Prewriteline + key + ' '
        #Post: backward maximum matching on the same chunks
        newsenList = []
        for key in senList:
            if key in StopWord:
                newsenList.append(key)
            else:
                tmplist = PostSenSeg(key, span)
                for keyseg in tmplist:
                    newsenList.append(keyseg)
        Postwriteline = ''
        for key in newsenList:
            Postwriteline = Postwriteline + key + ' '
        # NOTE(review): only the backward line is stripped of its trailing
        # space; the forward line keeps it. Preserved as-is because the
        # downstream evaluation may depend on the exact token count.
        Postwriteline = Postwriteline.strip(' ')
        w.write(Prewriteline.encode('utf-8') + '\n')
        w2.write(Postwriteline.encode('utf-8') + '\n')
    f.close()
    w.close()
    w2.close()
def PreSenSeg(sen, span):
    # Forward maximum matching over one chunk `sen` (contains no stop chars).
    # Greedily takes the longest dictionary word of at most `span` characters
    # starting at `cur`; unknown single characters fall out as 1-char tokens.
    # Relies on module globals WordDic (word -> 1) and module `re`.
    post = span      # exclusive end of the current candidate sen[cur:post]
    if len(sen) < span:
        post = len(sen)
    cur = 0          # start of the next token
    revlist = []     # tokens, in left-to-right order
    while 1:
        if cur >= len(sen):
            break
        # Peel off a leading run of digits (ASCII, fullwidth, or Chinese
        # numerals incl. 十/百/千/萬/億/兆) as one token, e.g. u"1998年" -> u"1998".
        # NOTE(review): the '|' separators inside [...] are literal class
        # members (so '|' itself matches), and \uff2f is fullwidth 'O',
        # not a digit - confirm intent.
        s = re.search(u"^[0|1|2|3|4|5|6|7|8|9|\uff11|\uff12|\uff13|\uff14|\uff15|\uff16|\uff17|\uff18|\uff19|\uff10|\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u96f6|\u5341|\u767e|\u5343|\u4e07|\u4ebf|\u5146|\uff2f]+", sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur + len(s.group())
                post = cur + span   # restart the match window after the token
                if post > len(sen):
                    post = len(sen)
        # Likewise peel off a leading run of ASCII/fullwidth letters (an
        # English word). NOTE(review): fullwidth 'j' (\uff4a) is missing
        # (\uff47 appears twice) and fullwidth 'T' (\uff34) is missing.
        s = re.search(u"^[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|\uff41|\uff42|\uff43|\uff44|\uff45|\uff46|\uff47|\uff48|\uff49|\uff47|\uff4b|\uff4c|\uff4d|\uff4e|\uff4f|\uff50|\uff51|\uff52|\uff53|\uff54|\uff55|\uff56|\uff57|\uff58|\uff59|\uff5a|\uff21|\uff22|\uff23|\uff24|\uff25|\uff26|\uff27|\uff28|\uff29|\uff2a|\uff2b|\uff2c|\uff2d|\uff2e|\uff2f|\uff30|\uff31|\uff32|\uff33|\uff35|\uff36|\uff37|\uff38|\uff39|\uff3a]+", sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur + len(s.group())
                post = cur + span
                if post > len(sen):
                    post = len(sen)
        # Longest-match step: accept sen[cur:post] if it is a dictionary
        # word, or if it is already down to a single character; otherwise
        # shorten the candidate by one character from the right.
        if (WordDic.has_key(sen[cur:post])) or (cur + 1 == post):
            if sen[cur:post] != '':
                revlist.append(sen[cur:post])
            cur = post
            post = post + span
            if post > len(sen):
                post = len(sen)
        else:
            post -= 1
    return revlist
def PostSenSeg(sen, span):
    """Backward maximum matching over one chunk `sen`.

    Mirror image of PreSenSeg: scan right-to-left, greedily taking the
    longest dictionary word of at most `span` characters that ENDS at `cur`.
    Tokens are collected right-to-left and reversed before returning, so the
    result is in normal left-to-right order. Uses globals WordDic and `re`.
    """
    cur = len(sen)    # exclusive end of the next token
    pre = cur - span  # start of the current candidate sen[pre:cur]
    if pre < 0:
        pre = 0
    revlist = []      # tokens, collected right-to-left
    while 1:
        if cur <= 0:
            break
        # Peel off a trailing digit run (ASCII/fullwidth/Chinese numerals).
        s = re.search(u"[0|1|2|3|4|5|6|7|8|9|\uff11|\uff12|\uff13|\uff14|\uff15|\uff16|\uff17|\uff18|\uff19|\uff10|\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u96f6|\u5341|\u767e|\u5343|\u4e07|\u4ebf|\u5146|\uff2f]+$", sen[pre:cur])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur - len(s.group())
                pre = cur - span
                if pre < 0:
                    pre = 0
        # BUG FIX: the original anchored the English-letter pattern at the
        # START of the window ("^..."), i.e. at the arbitrary position `pre`,
        # and then decremented `cur` by the length of a match that did not
        # end at `cur`. Backward matching must peel the run off the END of
        # the window, exactly like the digit branch above.
        s = re.search(u"[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|\uff41|\uff42|\uff43|\uff44|\uff45|\uff46|\uff47|\uff48|\uff49|\uff47|\uff4b|\uff4c|\uff4d|\uff4e|\uff4f|\uff50|\uff51|\uff52|\uff53|\uff54|\uff55|\uff56|\uff57|\uff58|\uff59|\uff5a|\uff21|\uff22|\uff23|\uff24|\uff25|\uff26|\uff27|\uff28|\uff29|\uff2a|\uff2b|\uff2c|\uff2d|\uff2e|\uff2f|\uff30|\uff31|\uff32|\uff33|\uff35|\uff36|\uff37|\uff38|\uff39|\uff3a]+$", sen[pre:cur])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur - len(s.group())
                pre = cur - span
                if pre < 0:
                    pre = 0
        # Longest-match step: accept sen[pre:cur] if it is a dictionary word
        # or a single character; otherwise shorten the candidate from the left.
        if (WordDic.has_key(sen[pre:cur])) or (cur - 1 == pre):
            if sen[pre:cur] != '':
                revlist.append(sen[pre:cur])
            cur = pre
            pre = pre - span
            if pre < 0:
                pre = 0
        else:
            pre += 1
    return revlist[::-1]
if __name__ == "__main__":
    # Command line: dictionary, input text, forward-MM output, backward-MM output.
    if len(sys.argv) != 5:
        # BUG FIX: the original message omitted the 4th required file and the
        # script fell through without exiting, crashing later on sys.argv[4].
        print("Usage: python wordseg.py Dicfile Inputfile Outfile Outfile2")
        sys.exit(1)
    Dicfile = sys.argv[1]
    Inputfile = sys.argv[2]
    Outputfile = sys.argv[3]
    Outputfile2 = sys.argv[4]
    InitDic(Dicfile)
    InitStopword()
    WordSeg(Inputfile, Outputfile, Outputfile2)
View Code
對比前向後向結果
評測名額
正确率 = 正确識别的個體總數 / 識别出的個體總數
召回率 = 正确識别的個體總數 / 測試集中存在的個體總數
F值 = 正确率 * 召回率 * 2 / (正确率 + 召回率)
評測程式
from __future__ import division
import os
import sys
import linecache
if __name__ == "__main__":
    # Word-level precision/recall/F evaluation: compares a segmented file
    # against a gold-standard file line by line. Tokens are matched as a
    # per-line multiset ("bag" scoring), not by character positions.
    if len(sys.argv) != 3:
        print "Usage: python evaluate.py inputfile goldfile"
        exit(0)
    infile = sys.argv[1]    # system output, space-separated tokens per line
    goldfile = sys.argv[2]  # reference segmentation, same line count
    count = 1        # 1-based line number for linecache.getline
    count_right = 0  # tokens found in both system and gold output
    count_split = 0  # total system tokens
    count_gold = 0   # total gold tokens
    f = file(infile)
    for line in f:
        inlist = line.strip().decode('utf-8').split(' ')
        goldlist = linecache.getline(goldfile, count).strip().decode('utf-8').split(' ')
        count += 1
        count_split += len(inlist)
        count_gold += len(goldlist)
        # NOTE(review): these are aliases, not copies - tmp_gold.remove()
        # mutates goldlist. Harmless here only because goldlist is re-read
        # from linecache on every iteration.
        tmp_in = inlist
        tmp_gold = goldlist
        for key in tmp_in:
            if key in tmp_gold:
                count_right += 1
                tmp_gold.remove(key)  # consume each gold token at most once
    f.close()
    print "count_right", count_right
    print "count_gold", count_gold
    print "count_split", count_split
    # True division is in effect via `from __future__ import division`.
    p = count_right / count_split  # precision
    r = count_right / count_gold   # recall
    F = 2 * p * r /(p + r)         # harmonic mean
    print "p:", p
    print "r:", r
    print "F:", F
結果
2. N-gram中文分詞
資料下載下傳:訓練資料集+測試+答案+字典+評測程式+N-gram分詞
def WordSeg(Inputfile, Outputfile):
    # Segment each input line with BOTH forward and backward maximum matching
    # and keep, chunk by chunk, whichever segmentation the bigram model P()
    # scores higher. Output is utf-8, tokens separated by single spaces.
    f = file(Inputfile)
    w = file(Outputfile, 'w')
    # Total number of bigram events observed in training; passed to P()/Pword()
    # as the add-one smoothing denominator.
    dic_size = 0
    for key in StatisticDic:
        for keys in StatisticDic[key]:
            dic_size += StatisticDic[key][keys]
    for line in f:
        line = line.strip().decode('utf-8')
        senList = []
        newsenList = []
        tmpword = ''
        # Split the line into chunks at stop characters; stop characters are
        # kept as chunks of their own.
        for i in range(len(line)):
            if line[i] in StopWord:
                senList.append(tmpword)
                senList.append(line[i])
                tmpword = ''
            else:
                tmpword += line[i]
                if i == len(line) - 1:
                    senList.append(tmpword)
        #N-gram: pick the higher-probability segmentation per chunk.
        for key in senList:
            if key in StopWord:
                newsenList.append(key)
            else:
                Pretmplist = PreSenSeg(key, span)
                Posttmplist = PostSenSeg(key, span)
                tmp_pre = P(Pretmplist, dic_size)
                tmp_post = P(Posttmplist, dic_size)
                tmplist = []
                # Strict '>' means ties go to the backward segmentation.
                if tmp_pre > tmp_post:
                    tmplist = Pretmplist
                else:
                    tmplist = Posttmplist
                #print 'tmplist', tmplist
                for keyseg in tmplist:
                    newsenList.append(keyseg)
        writeline = ''
        for key in newsenList:
            writeline = writeline + key + ' '
        writeline = writeline.strip(' ')
        w.write(writeline.encode('utf-8') + '\n')
        #break
    f.close()
    w.close()
運作
#! -*- coding:utf-8 -*-
from __future__ import division
import sys
import os
import re
# Delimiter characters for the N-gram segmenter (a superset of the earlier
# script's list: adds 、《》;" ・ ◎ etc.); each entry must be one character
# because WordSeg tests characters one by one against this list.
# BUG FIX: dropped the bogus u'\x30fb' entry - '\x30' is '0', so it denoted
# the 3-char string u'0fb', which can never equal a single character; the
# katakana middle dot is already present correctly as u'\u30fb'.
StopWordtmp = [' ', u'\u3000', u'\u3001', u'\u300a', u'\u300b', u'\uff1b', u'\uff02', u'\u30fb', u'\u25ce', u'\u3002', u'\uff0c', u'\uff01', u'\uff1f', u'\uff1a', u'\u201c', u'\u201d', u'\u2018', u'\u2019', u'\uff08', u'\uff09', u'\u3010', u'\u3011', u'\uff5b', u'\uff5d', u'-', u'\uff0d', u'\uff5e', u'\uff3b', u'\uff3d', u'\u3014', u'\u3015', u'\uff0e', u'\uff20', u'\uffe5', u'\u2022', u'.']
WordDic = {}        # word -> 1, loaded by InitDic
StopWord = []       # populated from StopWordtmp by InitStopword
StatisticDic = {}   # bigram counts: StatisticDic[w1][w2] = count, built by InitStatisticDic
span = 16           # maximum word length (in chars) tried by the matchers
def InitStopword():
    """Copy the raw delimiter characters into the module-level StopWord list."""
    StopWord.extend(StopWordtmp)
def InitDic(Dicfile):
    # Load the word dictionary: one word per line, utf-8. Words are stored
    # as unicode keys with dummy value 1 so lookups are O(1) via has_key.
    f = file(Dicfile)  # Python 2 built-in file()
    for line in f:
        line = line.strip().decode('utf-8')
        WordDic[line] = 1;
    f.close()
    print len(WordDic)
    print "Dictionary has built down!"
def InitStatisticDic(StatisticDicfile):
    """Build bigram counts from a pre-segmented training corpus.

    Each line of StatisticDicfile is a space-separated, already-segmented
    sentence (utf-8). Fills the module-global StatisticDic so that
    StatisticDic[w1][w2] counts how often w2 directly follows w1, with the
    virtual tokens '<BEG>' / '<END>' marking sentence start and end.
    Empty chunks (from leading or doubled spaces) are skipped.
    """
    StatisticDic['<BEG>'] = {}
    f = file(StatisticDicfile)
    for line in f:
        chunk = line.strip().decode('utf-8').split(' ')
        # sentence start -> first word
        if chunk[0] != '':
            beg = StatisticDic['<BEG>']
            beg[chunk[0]] = beg.get(chunk[0], 0) + 1
        # each word -> its immediate successor
        for i in range(len(chunk) - 1):
            if chunk[i] != '':
                follow = StatisticDic.setdefault(chunk[i], {})
                follow[chunk[i + 1]] = follow.get(chunk[i + 1], 0) + 1
        # last word -> sentence end
        if chunk[-1] != '':
            follow = StatisticDic.setdefault(chunk[-1], {})
            follow['<END>'] = follow.get('<END>', 0) + 1
    # BUG FIX: the original never closed the file handle (InitDic does).
    f.close()
def WordSeg(Inputfile, Outputfile):
    # Segment each input line with BOTH forward and backward maximum matching
    # and keep, chunk by chunk, whichever segmentation the bigram model P()
    # scores higher. Output is utf-8, tokens separated by single spaces.
    f = file(Inputfile)
    w = file(Outputfile, 'w')
    # Total number of bigram events observed in training; passed to P()/Pword()
    # as the add-one smoothing denominator.
    dic_size = 0
    for key in StatisticDic:
        for keys in StatisticDic[key]:
            dic_size += StatisticDic[key][keys]
    for line in f:
        line = line.strip().decode('utf-8')
        senList = []
        newsenList = []
        tmpword = ''
        # Split the line into chunks at stop characters; stop characters are
        # kept as chunks of their own.
        for i in range(len(line)):
            if line[i] in StopWord:
                senList.append(tmpword)
                senList.append(line[i])
                tmpword = ''
            else:
                tmpword += line[i]
                if i == len(line) - 1:
                    senList.append(tmpword)
        #N-gram: pick the higher-probability segmentation per chunk.
        for key in senList:
            if key in StopWord:
                newsenList.append(key)
            else:
                Pretmplist = PreSenSeg(key, span)
                Posttmplist = PostSenSeg(key, span)
                tmp_pre = P(Pretmplist, dic_size)
                tmp_post = P(Posttmplist, dic_size)
                tmplist = []
                # Strict '>' means ties go to the backward segmentation.
                if tmp_pre > tmp_post:
                    tmplist = Pretmplist
                else:
                    tmplist = Posttmplist
                #print 'tmplist', tmplist
                for keyseg in tmplist:
                    newsenList.append(keyseg)
        writeline = ''
        for key in newsenList:
            writeline = writeline + key + ' '
        writeline = writeline.strip(' ')
        w.write(writeline.encode('utf-8') + '\n')
        #break
    f.close()
    w.close()
def P(tmplist, dic_size):
    """Add-one-smoothed bigram probability of the segmentation `tmplist`.

    Multiplies P(w1|<BEG>), the successive P(w[i+1]|w[i]), and P(<END>|w[-1]),
    each estimated by Pword(). Returns 0 for an empty segmentation.

    NOTE(review): a plain product drifts toward 0 for long chunks; only the
    relative comparison of two segmentations of the SAME chunk is used here,
    but summing log-probabilities would be the safer long-term form.
    """
    # (removed a block of commented-out debug prints from the original)
    rev = 1
    if len(tmplist) < 1:
        return 0
    rev *= Pword(tmplist[0], '<BEG>', dic_size)
    rev *= Pword('<END>', tmplist[-1], dic_size)
    for i in range(len(tmplist) - 1):
        rev *= Pword(tmplist[i + 1], tmplist[i], dic_size)
    return rev
def Pword(word1, word2, dic_size):
    """Add-one-smoothed bigram probability P(word1 | word2):
    (count(word2, word1) + 1) / (count(word2, *) + dic_size).
    True division is in effect via `from __future__ import division`."""
    pair_count = 0      # times word1 followed word2 in training
    context_total = 0   # times word2 was followed by anything
    followers = StatisticDic.get(word2)
    if followers is not None:
        for nxt in followers:
            cnt = followers[nxt]
            context_total += cnt
            if nxt == word1:
                pair_count = cnt
    return (pair_count + 1) / (context_total + dic_size)
def PreSenSeg(sen, span):
    # Forward maximum matching over one chunk `sen` (contains no stop chars).
    # Greedily takes the longest dictionary word of at most `span` characters
    # starting at `cur`; unknown single characters fall out as 1-char tokens.
    # Relies on module globals WordDic (word -> 1) and module `re`.
    post = span      # exclusive end of the current candidate sen[cur:post]
    if len(sen) < span:
        post = len(sen)
    cur = 0          # start of the next token
    revlist = []     # tokens, in left-to-right order
    while 1:
        if cur >= len(sen):
            break
        # Peel off a leading run of digits (ASCII, fullwidth, or Chinese
        # numerals incl. 十/百/千/萬/億/兆) as one token, e.g. u"1998年" -> u"1998".
        # NOTE(review): the '|' separators inside [...] are literal class
        # members (so '|' itself matches), and \uff2f is fullwidth 'O',
        # not a digit - confirm intent.
        s = re.search(u"^[0|1|2|3|4|5|6|7|8|9|\uff11|\uff12|\uff13|\uff14|\uff15|\uff16|\uff17|\uff18|\uff19|\uff10|\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u96f6|\u5341|\u767e|\u5343|\u4e07|\u4ebf|\u5146|\uff2f]+", sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur + len(s.group())
                post = cur + span   # restart the match window after the token
                if post > len(sen):
                    post = len(sen)
        # Likewise peel off a leading run of ASCII/fullwidth letters (an
        # English word). NOTE(review): fullwidth 'j' (\uff4a) is missing
        # (\uff47 appears twice) and fullwidth 'T' (\uff34) is missing.
        s = re.search(u"^[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|\uff41|\uff42|\uff43|\uff44|\uff45|\uff46|\uff47|\uff48|\uff49|\uff47|\uff4b|\uff4c|\uff4d|\uff4e|\uff4f|\uff50|\uff51|\uff52|\uff53|\uff54|\uff55|\uff56|\uff57|\uff58|\uff59|\uff5a|\uff21|\uff22|\uff23|\uff24|\uff25|\uff26|\uff27|\uff28|\uff29|\uff2a|\uff2b|\uff2c|\uff2d|\uff2e|\uff2f|\uff30|\uff31|\uff32|\uff33|\uff35|\uff36|\uff37|\uff38|\uff39|\uff3a]+", sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur + len(s.group())
                post = cur + span
                if post > len(sen):
                    post = len(sen)
        # Longest-match step: accept sen[cur:post] if it is a dictionary
        # word, or if it is already down to a single character; otherwise
        # shorten the candidate by one character from the right.
        if (WordDic.has_key(sen[cur:post])) or (cur + 1 == post):
            if sen[cur:post] != '':
                revlist.append(sen[cur:post])
            cur = post
            post = post + span
            if post > len(sen):
                post = len(sen)
        else:
            post -= 1
    return revlist
def PostSenSeg(sen, span):
    """Backward maximum matching over one chunk `sen`.

    Mirror image of PreSenSeg: scan right-to-left, greedily taking the
    longest dictionary word of at most `span` characters that ENDS at `cur`.
    Tokens are collected right-to-left and reversed before returning, so the
    result is in normal left-to-right order. Uses globals WordDic and `re`.
    """
    cur = len(sen)    # exclusive end of the next token
    pre = cur - span  # start of the current candidate sen[pre:cur]
    if pre < 0:
        pre = 0
    revlist = []      # tokens, collected right-to-left
    while 1:
        if cur <= 0:
            break
        # Peel off a trailing digit run (ASCII/fullwidth/Chinese numerals).
        s = re.search(u"[0|1|2|3|4|5|6|7|8|9|\uff11|\uff12|\uff13|\uff14|\uff15|\uff16|\uff17|\uff18|\uff19|\uff10|\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u96f6|\u5341|\u767e|\u5343|\u4e07|\u4ebf|\u5146|\uff2f]+$", sen[pre:cur])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur - len(s.group())
                pre = cur - span
                if pre < 0:
                    pre = 0
        # BUG FIX: the original anchored the English-letter pattern at the
        # START of the window ("^..."), i.e. at the arbitrary position `pre`,
        # and then decremented `cur` by the length of a match that did not
        # end at `cur`. Backward matching must peel the run off the END of
        # the window, exactly like the digit branch above.
        s = re.search(u"[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|\uff41|\uff42|\uff43|\uff44|\uff45|\uff46|\uff47|\uff48|\uff49|\uff47|\uff4b|\uff4c|\uff4d|\uff4e|\uff4f|\uff50|\uff51|\uff52|\uff53|\uff54|\uff55|\uff56|\uff57|\uff58|\uff59|\uff5a|\uff21|\uff22|\uff23|\uff24|\uff25|\uff26|\uff27|\uff28|\uff29|\uff2a|\uff2b|\uff2c|\uff2d|\uff2e|\uff2f|\uff30|\uff31|\uff32|\uff33|\uff35|\uff36|\uff37|\uff38|\uff39|\uff3a]+$", sen[pre:cur])
        if s:
            if s.group() != '':
                revlist.append(s.group())
                cur = cur - len(s.group())
                pre = cur - span
                if pre < 0:
                    pre = 0
        # Longest-match step: accept sen[pre:cur] if it is a dictionary word
        # or a single character; otherwise shorten the candidate from the left.
        if (WordDic.has_key(sen[pre:cur])) or (cur - 1 == pre):
            if sen[pre:cur] != '':
                revlist.append(sen[pre:cur])
            cur = pre
            pre = pre - span
            if pre < 0:
                pre = 0
        else:
            pre += 1
    return revlist[::-1]
if __name__ == "__main__":
    # Command line: dictionary, pre-segmented training corpus, input text, output.
    if len(sys.argv) != 5:
        # BUG FIX: the original message omitted StatisticDicfile and the
        # script fell through without exiting, crashing later on sys.argv[4].
        print("Usage: python wordseg.py Dicfile StatisticDicfile Inputfile Outfile")
        sys.exit(1)
    Dicfile = sys.argv[1]
    StatisticDicfile = sys.argv[2]
    Inputfile = sys.argv[3]
    Outputfile = sys.argv[4]
    InitDic(Dicfile)
    InitStatisticDic(StatisticDicfile)
    InitStopword()
    WordSeg(Inputfile, Outputfile)
分詞結果
可以看出結果不如前向切分,但高于後向切分。原因是沒有采取平滑政策,利用+1平滑後結果變為
結果超過前向、後向切分,說明有效。