天天看點

【Python】反向索引

代碼連結

https://github.com/Birdy-C/Shakespeare-search-engine

預處理

word stemming

一個單詞可能有不同的形式,在英語中比如動詞的主動被動、單複數等,例如 live/lives/lived。

雖然英文的處理看起來已經很複雜了,但實際上中文的處理要複雜得多。

stop words

比如a、the這種詞在處理的時候沒有實際意義。在這裡處理的時候先對詞頻進行統計,人為界定停詞,簡單的全部替換為空格。但是這種方式并不适用于所有的情況,對于比如,To be or not to be,這種就很難處理。

具體實作

Index.txt 記錄所出現的檔案

這裡将建立反向索引分為三步

thefile.txt 所有出現過的詞(詞頻由高到低)

stop_word.txt 停詞

data.pkl 所建立的索引

1 count.py 确定停詞

2 index.py 建立反向索引

3 query.py 用于查詢

這裡在建立反向索引的時候隻記錄了出現的檔案名,并沒有記錄在檔案中出現的位置。

圖為count.py生成的詞頻統計

【Python】反向索引

count.py

#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import sys
from nltk import *                                                                                          #import natural-language-toolkit
from operator import itemgetter                                                                 #for sort

def output_count(fdist):                                                #output the relative information
    """Report a frequency distribution and persist the full vocabulary.

    Prints the top 250 (word, count) pairs, draws a plot of the 30 most
    common words, and writes every word to thefile.txt in decreasing
    order of frequency (space separated).
    """
    vocabulary = fdist.items()                                          #get all the (word, count) pairs
    # Sort by count (element 1), most frequent first.  The published
    # listing called itemgetter() with no index, which is a TypeError.
    vocabulary = sorted(vocabulary, key=itemgetter(1), reverse=True)
    print(vocabulary[:250])                                             #print top 250 vocabulary and its count on the screen
    print('drawing plot.....')                                          #show process
    fdist.plot(30, cumulative=False)                                    #plot the 30 most common words

    # `with` guarantees the file is closed even if a write fails.
    with open('thefile.txt', 'w') as file_object:
        for pair in vocabulary:
            file_object.write(pair[0] + ' ')                            #all the vocabulary in decreasing order


def pre_file(filename):
    """Read <filename>.txt and return its Porter-stemmed tokens.

    The text is lower-cased and every punctuation character is replaced
    with a space before splitting on whitespace.
    """
    print("read file %s.txt....."%filename)                             #show process
    with open(str(filename) + '.txt', "r") as source:                   #closed even on error
        content = source.read().lower()
    # Replace punctuation with spaces.  The published listing was
    # garbled here ("[email protected]"); this is the intended ASCII set.
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
        content = content.replace(ch, " ")

    plurals = content.split()                                           #split at '\n' or ' '

    stemmer = PorterStemmer()                                           #prepare for stemming
    singles = [stemmer.stem(plural) for plural in plurals]              #handle stemming

    return singles



#main function
def main():
    """Count word frequencies over every title listed in index.txt and
    hand the distribution to output_count()."""
    print('read index.....')                                            #show process
    with open('index.txt', 'r') as index_file:                          #titles that need to be handled
        titles = index_file.read().split()

    fdist1 = FreqDist()                                                 #create a new dist

    for title in titles:
        txt = pre_file(title)                                           #pre-process the txt

        for words in txt:
            # Re-encode from utf-8 to the filesystem encoding (gbk on the
            # author's machine); only meaningful under Python 2 where the
            # tokens are byte strings.
            words = words.decode('utf-8').encode(sys.getfilesystemencoding())
            fdist1[words] += 1                                          #count this occurrence

    output_count(fdist1)


#runfile
if __name__ == '__main__':
    main()
           

index.py

#-*- coding:utf-8 -*-
'''
@author birdy qian
'''

import sys
import pickle                   
from nltk import *                                                                                          #import natural-language-toolkit
from operator import itemgetter                                                                 #for sort


STOPWORDS = []                                                                                          #grobal variable

def output_index(result):
    """Serialize the inverted-index dictionary to data.pkl."""
    with open('data.pkl', 'wb') as sink:
        pickle.dump(result, sink)                                       #pickle with the default protocol


def pre_file(filename):
    """Read <filename>.txt and return its Porter-stemmed tokens with
    punctuation and STOPWORDS removed.

    The text is lower-cased, punctuation is replaced with spaces, the
    result is split on whitespace, stop words are dropped as whole
    tokens, and the survivors are stemmed.
    """
    global STOPWORDS
    print("read file %s.txt....."%filename)                             #show process
    with open(str(filename) + '.txt', "r") as source:                   #closed even on error
        content = source.read().lower()
    # Replace punctuation with spaces.  The published listing was
    # mis-encoded here; this is the intended ASCII set.
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
        content = content.replace(ch, " ")

    plurals = content.split()                                           #split at '\n' or ' '

    # Drop stop words as whole tokens.  The original replaced them as
    # substrings of the raw text, which also mangled longer words
    # (removing "the" turns "there" into " re").
    stopset = set(STOPWORDS)
    plurals = [word for word in plurals if word not in stopset]

    stemmer = PorterStemmer()                                           #prepare for stemming
    singles = [stemmer.stem(plural) for plural in plurals]              #handle stemming

    return singles

def readfile(filename):
    """Return the whitespace-separated tokens of *filename*."""
    # `with` closes the handle even on error; the original also shadowed
    # the built-in name `input`.
    with open(filename, 'r') as source:
        return source.read().split()                                    #split at '\n' or ' '



#main function
def main():
    """Build the inverted index.

    Maps each stemmed word to the list of titles (from index.txt) whose
    text contains it, then pickles the dictionary via output_index().
    """
    global STOPWORDS
    print('read index.....')                                            #show process
    titles = readfile('index.txt')
    print('read stopwords.....')
    STOPWORDS = readfile('stop_word.txt')

    print('create word list.....')
    result = {}                                                         #word -> list of titles

    for title in titles:
        txt = pre_file(title)                                           #stemmed tokens of this title

        for words in set(txt):                                          #each word counted once per title
            # Re-encode utf-8 -> filesystem encoding (gbk on the
            # author's machine); only meaningful under Python 2.
            words = words.decode('utf-8').encode(sys.getfilesystemencoding())
            # setdefault replaces the original get/None/append dance.
            result.setdefault(words, []).append(title)

    output_index(result)


#runfile
if __name__ == '__main__':
    main()
           

query.py

#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import os 
import sys
import pprint, pickle
from nltk import PorterStemmer

def readfile(filename):
    """Return the whitespace-separated tokens of *filename*."""
    # `with` closes the handle even on error; the original also shadowed
    # the built-in name `input`.
    with open(filename, 'r') as source:
        return source.read().split()                                    #split at '\n' or ' '

def getdata():
    """Load and return the pickled inverted index from data.pkl."""
    # NOTE(review): pickle.load on untrusted files can execute arbitrary
    # code; data.pkl is produced locally by index.py, so this is fine.
    with open('data.pkl', 'rb') as pkl_file:                            #index is saved in 'data.pkl'
        return pickle.load(pkl_file)

def output(result):
    """Print the matching titles ten at a time.

    *result* is the list of titles matching a query, or None when a
    queried word was absent from the index.  Between pages the user is
    prompted; any input other than 'N' stops the listing.
    """
    if result is None:                                                  #a word was not in the index
        print(None)
        return
    if len(result) == 0:                                                #the intersection is empty
        print(None)
        return

    if len(result) < 10:                                                #fewer than 10 records: one screen
        print(result)

    else:                                                               #page through, 10 records at a time
        print('get ' + str(len(result)) + ' records')                   #the record number
        for i in range(0, len(result) // 10 + 1):
            print('10 records start from ' + str(i * 10 + 1))

            if 10 * i + 10 < len(result):                               #print from 10*i to 10*i + 10
                print(result[10 * i : 10 * i + 10])
            else:                                                       #print from 10*i to the end
                print(result[10 * i : len(result)])
                break
            getstr = raw_input("Enter 'N' for next ten records & other input to quit : ")
            if getstr != 'N':
                break



#main function
def main():
    """Interactive query loop.

    Stems the user's words (after dropping stop words), intersects
    their posting lists from the pickled index, and pages through the
    matching titles via output().  Entering '\\' quits.
    """
    data_list = getdata()                                               #word -> list of titles
    STOPWORDS = readfile('stop_word.txt')
    stemmer = PorterStemmer()                                           #prepare for stemming

    def parse(raw):
        # Lower-case, drop stop words as whole tokens (substring
        # replacement would mangle longer words), then stem.
        stopset = set(STOPWORDS)
        tokens = raw.lower().split()
        return [stemmer.stem(word) for word in tokens if word not in stopset]

    while True:
        get_str = raw_input("Enter your query('\\'to quit): ")
        if get_str == '\\':                                             #leave the loop
            break

        query_list = parse(get_str)
        while not query_list:                                           #keep asking until non-empty
            query_list = parse(raw_input("Please enter more information: "))

        result = []
        for k in range(len(query_list)):
            posting = data_list.get(query_list[k])                      #None if word unknown
            if k == 0:                                                  #first word seeds the result
                result = posting
            elif result and posting:                                    #intersect only real lists
                result = list(set(result).intersection(posting))
            else:                                                       #a missing word empties the result
                result = []
        output(result)


#runfile
if __name__ == '__main__':
    main()