#!/usr/bin/python
#-*- coding:gbk-*-
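"""
spec: split usrdict into two parts according to whether each entry hits
      one of the 1.26M (126W) person names
parms:
    [IN]  keywords.txt    key (person name) list, one key per line
    [IN]  file.txt        usrdict input, four tab-separated fields per line
    [OUT] matched.txt     entries whose first field starts or ends with a key
    [OUT] notmatched.txt  entries that hit no key
    [OUT] ufuwfoverflow   entries whose third or fourth field is <= 0
author: [email protected] date 20120808
"""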
import re;
import sys;
def LoadKeys(filename):
    """Load the match keys from a text file into memory.

    Each line of ``filename`` is one key. Leading and trailing whitespace
    (including the line terminator) is removed from every line.

    Blank lines are dropped: an empty key would later be turned into the
    pattern ``(^|$)``, which matches every string.

    Args:
        filename: path of the key file.

    Returns:
        A list of the non-empty, stripped keys in file order.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(filename, "r") as key_file:
        stripped = (line.strip() for line in key_file)
        return [key for key in stripped if key]
def PrintUsage():
    """Print the command-line usage string and terminate with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print('program [IN] keywords.txt [IN]file.txt [OUT] matched.txt [OUT] notmatched.txt [OUT] ufuwfoverflow')
    # sys.exit instead of the bare ``exit`` helper: ``exit`` is injected by
    # the ``site`` module for interactive use and is not always available.
    sys.exit(1)
def _BuildKeyPatterns(keys):
    """Compile the start-or-end pattern for every key exactly once.

    For a key ``k`` the pattern is ``(^k|k$)`` decoded from GBK, i.e. the
    key is interpreted as a regular expression (it is NOT escaped) and must
    match at the beginning or at the end of the searched text.

    Args:
        keys: list of GBK-encoded byte-string keys.

    Returns:
        A list aligned with ``keys``. Each slot holds the compiled pattern,
        or None when the key is empty (its pattern would match everything),
        is not valid GBK, or does not form a valid regular expression.
    """
    patterns = []
    for key in keys:
        compiled = None
        if key:
            try:
                compiled = re.compile(unicode("(^" + key + "|" + key + "$)", "gbk"))
            except (UnicodeDecodeError, re.error):
                # An unusable key is skipped on its own; it must not stop
                # the remaining keys from being tried.
                compiled = None
        patterns.append(compiled)
    return patterns


def _FindMatchingKey(segment, keys, patterns):
    """Return the first key whose pattern hits ``segment``, or None.

    Args:
        segment: GBK-encoded byte string (first field of an input line).
        keys: list of keys, in priority order.
        patterns: result of ``_BuildKeyPatterns(keys)``.

    Returns:
        The first matching key, or None when nothing matches or when
        ``segment`` cannot be decoded as GBK.
    """
    try:
        useg = unicode(segment, 'gbk')
    except UnicodeDecodeError:
        # Undecodable segments are reported as "not matched".
        return None
    for count, pattern in enumerate(patterns):
        if count % 100000 == 0:
            print('模式已經掃描%d個' % count)
        if pattern is not None and pattern.search(useg):
            return keys[count]
    return None


def _Main(argv):
    """Split the input file into matched / not-matched / overflow outputs.

    ``argv`` must hold six items: program name, key file, input file and
    three output files (matched, not matched, overflow). Input lines are
    stripped; empty lines and lines without exactly four tab-separated
    fields are skipped. Lines whose third or fourth field is <= 0 go to
    the overflow file unchanged. All other lines get one extra
    tab-separated field appended: the first key hitting the first field
    (written to the matched file) or ``_`` (written to the not-matched
    file).

    Progress messages and every matched line are printed to stdout.
    """
    delim = "\t"
    if len(argv) != 6:
        PrintUsage()
    keys = LoadKeys(argv[1])
    print(len(keys))
    # Hoisted out of the per-line loop: the patterns depend on the keys only.
    patterns = _BuildKeyPatterns(keys)

    # Outputs are opened before the input, as before; ``with`` closes all
    # four files even when processing aborts with an exception.
    with open(argv[3], 'w') as fout1, \
            open(argv[4], 'w') as fout2, \
            open(argv[5], 'w') as fout3, \
            open(argv[2], "r") as fid:
        linecount = 0
        for rawline in fid:
            line = rawline.strip()
            if not line:
                continue
            if linecount % 100000 == 0:
                print('語料已經處理%d行' % linecount)
            linecount += 1
            linesegs = line.split("\t")
            if len(linesegs) != 4:
                continue
            # NOTE(review): a non-numeric third/fourth field raises
            # ValueError here and aborts the run -- confirm that is intended.
            if int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0:
                fout3.write(line)
                fout3.write("\n")
                continue
            hit = _FindMatchingKey(linesegs[0], keys, patterns)
            if hit is None:
                linesegs.append("_")
                target = fout2
            else:
                print(line)
                linesegs.append(hit)
                target = fout1
            target.write(delim.join(linesegs))
            target.write("\n")


if __name__ == "__main__":
    _Main(sys.argv)
