天天看點

用python做含有中文的正規表達式模式比對

#!/usr/bin/python

#-*- coding:gbk-*-

'''

spec: split usrdict into two parts according to whether each entry hits one of the 1.26 million (126W) person names

parms:

[IN] 

[IN]

[OUT]

author: [email protected] date 20120808

'''

import re;

import sys;

def LoadKeys(filename):
    """Load the keyword list from *filename* into memory.

    Every line of the file yields one entry, with leading and trailing
    whitespace (including the line terminator) removed. Blank lines are
    kept as empty strings, so the result has one entry per input line.

    Args:
        filename: path of the keyword file, one keyword per line.

    Returns:
        A list with the trimmed lines, in file order.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    # The trimming stays regex-based (rather than str.strip) so that exactly
    # the characters matched by \s are removed.
    strip_ws = re.compile(r'^\s+|\s+$')

    # open() replaces the Python-2-only file() builtin; the with-block closes
    # the handle even when reading fails part-way through.
    with open(filename, "r") as fid:
        return [strip_ws.sub('', line) for line in fid]

def PrintUsage():
    """Print the command-line synopsis and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    # A single parenthesised argument prints the same text whether print is
    # the Python 2 statement or the print function.
    print('program [IN] keywords.txt [IN]file.txt [OUT] matched.txt [OUT] notmatched.txt [OUT] ufuwfoverflow')

    # sys.exit is always available, unlike the exit() helper that the site
    # module injects only for interactive convenience.
    sys.exit(1)

if __name__ == "__main__":
    # Command-line driver. Reads tab-separated records from the input file
    # (argv[2]) and routes each one to one of three output files:
    #   argv[3] - the first column starts or ends with a keyword pattern;
    #             the matching keyword is appended as an extra column
    #   argv[4] - no keyword matched; "_" is appended as the extra column
    #   argv[5] - the third or fourth column is an integer <= 0
    # Blank lines and records without exactly four columns are dropped.
    # Keywords (argv[1]) are used as regular-expression fragments and, like
    # the first column, are decoded from GBK before matching.
    # Requires Python 2.7: uses unicode() and the multi-item with-statement.
    delim = "\t"
    p = re.compile("(^\\s+|\\s+$)")  # trims whitespace at both ends of a line

    if len(sys.argv) != 6:
        PrintUsage()

    keyfile = str(sys.argv[1])
    keys = LoadKeys(keyfile)
    print(len(keys))

    inputfile = str(sys.argv[2])
    outputfile1 = str(sys.argv[3])
    outputfile2 = str(sys.argv[4])
    outputfile3 = str(sys.argv[5])

    # The outputs are opened (and truncated) before the input. The with-block
    # closes all four handles even when processing stops on an error.
    with open(outputfile1, 'w') as fout1, \
            open(outputfile2, 'w') as fout2, \
            open(outputfile3, 'w') as fout3, \
            open(inputfile, "r") as fid:
        linecount = 0

        for line in fid:
            flag = 0  # becomes 1 once a keyword has matched this record

            line = p.sub('', line)
            if '' == line:
                continue

            if 0 == linecount % 100000:
                print('語料已經處理%d行' % linecount)
            # Counted for every non-blank line, so the progress message keeps
            # appearing every 100000 lines instead of only once.
            linecount = linecount + 1

            linesegs = line.split(delim)
            if 4 != len(linesegs):
                continue

            # int() raises ValueError on a non-numeric column; that error is
            # deliberately left to propagate rather than being hidden.
            if int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0:
                fout3.write(line)
                fout3.write("\n")
                continue

            try:
                useg = unicode(linesegs[0], 'gbk')
            except UnicodeDecodeError:
                # An undecodable first column cannot match any keyword; the
                # record falls through to the "not matched" output below.
                useg = None

            if useg is not None:
                # NOTE: every pattern is rebuilt and recompiled for each
                # record; caching them would trade memory for time.
                count = 0
                for key in keys:
                    if 0 == count % 100000:
                        print('模式已經掃描%d個' % count)
                    count = count + 1

                    if not key:
                        # A blank keyword line would build "(^|$)", which
                        # matches every record.
                        continue

                    patternstr = "(^" + key + "|" + key + "$)"
                    try:
                        pattern = re.compile(unicode(patternstr, "gbk"))
                    except (UnicodeDecodeError, re.error):
                        # Skip only this keyword (undecodable, or not a valid
                        # regular expression) and keep scanning the others.
                        continue

                    if pattern.search(useg):
                        print(line)
                        flag = 1
                        linesegs.append(key)
                        fout1.write(delim.join(linesegs))
                        fout1.write("\n")
                        break

            if flag == 0:
                linesegs.append("_")
                fout2.write(delim.join(linesegs))
                fout2.write("\n")

用python做含有中文的正規表達式模式比對