Code link
https://github.com/Birdy-C/Shakespeare-search-engine
Preprocessing
word stemming
A word can appear in different forms; in English, for example, verbs change between active and passive voice and nouns between singular and plural, e.g. live/lives/lived.
Although handling English already looks complicated, handling Chinese is far more complex.
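A minimal sketch of what the Porter stemmer used in the scripts below does (assuming NLTK is installed):

from nltk import PorterStemmer

stemmer = PorterStemmer()
for w in ['live', 'lives', 'lived', 'living']:
    print w, '->', stemmer.stem(w)   # every form reduces to the stem 'live'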
stop words
Words such as a and the carry no real meaning for retrieval. Here we first compute word-frequency statistics, designate the stop words by hand, and then simply replace them all with spaces. This approach does not suit every case, though; a query like "To be or not to be" becomes very hard to handle.
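A minimal sketch of the frequency-based idea (the word list here is hypothetical; in this project the candidates come from the Shakespeare texts and are reviewed by hand before being written to stop_word.txt):

from nltk import FreqDist

fdist = FreqDist()
for w in ['the', 'the', 'the', 'a', 'a', 'king', 'crown']:
    fdist[w] += 1
# the most frequent words become stop-word candidates; the cutoff is picked by hand
candidates = [w for w, c in fdist.items() if c >= 2]
print candidates   # e.g. ['the', 'a']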
Implementation
Building the inverted index is divided into three steps:
1 count.py — determine the stop words
2 index.py — build the inverted index
3 query.py — run queries
The files involved (see the sketch after this note for the structure of the last one):
index.txt — records the files to be processed
thefile.txt — every word that appears (ordered from high to low frequency)
stop_word.txt — the stop words
data.pkl — the index that is built
Note that when building the inverted index, only the names of the files in which a word appears are recorded, not the positions inside each file.
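For concreteness, a minimal sketch of what ends up in data.pkl: a plain dict mapping each stemmed, lowercased word to the list of file names it appears in (the entries below are made up for illustration):

import pickle

index = {'crown': ['hamlet', 'macbeth'],   # hypothetical posting lists
         'dagger': ['macbeth']}
output = open('data.pkl', 'wb')
pickle.dump(index, output)                 # same pickle round-trip as index.py / query.py
output.close()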
(Figure: word-frequency statistics generated by count.py)

count.py
#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import sys
from nltk import *                # import natural-language-toolkit
from operator import itemgetter   # for sorting

def output_count(fdist):          # output the relevant information
    vocabulary = fdist.items()    # get all the vocabulary
    vocabulary = sorted(vocabulary, key=itemgetter(1), reverse=True)  # sort by count in decreasing order
    print vocabulary[:250]        # print the top 250 words and their counts on the screen
    print 'drawing plot.....'     # show progress
    fdist.plot(50, cumulative=False)   # plot the 50 most frequent words
    # output to file
    file_object = open('thefile.txt', 'w')   # prepare the file for writing
    for j in vocabulary:
        file_object.write(j[0] + ' ')        # output all the vocabulary in decreasing order
    file_object.close()           # close the file

def pre_file(filename):
    print("read file %s.txt....." % filename)   # show progress
    content = open(str(filename) + '.txt', "r").read()
    content = content.lower()
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':   # remove the punctuation
        content = content.replace(ch, " ")
    plurals = content.split()     # split the file at '\n' or ' '
    stemmer = PorterStemmer()     # prepare for stemming
    singles = [stemmer.stem(plural) for plural in plurals]   # handle stemming
    return singles

# main function
def main():
    print "read index....."       # show progress
    input = open('index.txt', 'r')   # titles that need to be handled
    all_the_file = input.read()
    file = all_the_file.split()
    input.close()                 # close the file
    fdist1 = FreqDist()           # create a new frequency distribution
    for x in range(0, len(file)):
        txt = pre_file(file[x])   # pre-handle the txt
        for words in txt:
            words = words.decode('utf-8').encode(sys.getfilesystemencoding())   # change string type from utf-8 to gbk
            fdist1[words] += 1    # add it to the distribution
    output_count(fdist1)

# run file
if __name__ == '__main__':
    main()
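The thefile.txt written by count.py is then inspected by hand: the words at the top of the frequency list are the stop-word candidates, and the ones chosen are copied into stop_word.txt.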
index.py
#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import sys
import pickle
from nltk import *                # import natural-language-toolkit
from operator import itemgetter   # for sorting

STOPWORDS = []                    # global variable

def output_index(result):
    output = open('data.pkl', 'wb')
    pickle.dump(result, output)   # pickle the dictionary using protocol 0
    output.close()

def pre_file(filename):
    global STOPWORDS
    print("read file %s.txt....." % filename)   # show progress
    content = open(str(filename) + '.txt', "r").read()
    content = content.lower()
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':   # remove the punctuation
        content = content.replace(ch, " ")
    for ch in STOPWORDS:          # remove the stop words (simple substring replacement, as described above)
        content = content.replace(ch, " ")
    plurals = content.split()     # split the file at '\n' or ' '
    stemmer = PorterStemmer()     # prepare for stemming
    singles = [stemmer.stem(plural) for plural in plurals]   # handle stemming
    return singles

def readfile(filename):
    input = open(filename, 'r')   # titles that need to be handled
    all_the_file = input.read()
    words = all_the_file.split()  # split the file at '\n' or ' '
    input.close()
    return words

# main function
def main():
    global STOPWORDS
    print "read index....."       # show progress
    file = readfile('index.txt')
    print "read stopwords....."
    STOPWORDS = readfile('stop_word.txt')
    print "create word list....."
    word = list(readfile('thefile.txt'))   # the file with all the words in all the books
    result = {}                   # memorize the result
    for x in range(0, len(file)):
        txt = pre_file(file[x])   # file[x] is the title
        txt = {}.fromkeys(txt).keys()   # remove duplicate words
        # could also use set(txt)
        for words in txt:
            words = words.decode('utf-8').encode(sys.getfilesystemencoding())   # change string type from utf-8 to gbk
            if result.get(words) == None:   # the word is not yet in the dictionary
                result[words] = [file[x]]
            else:                           # the word is already in the dictionary
                t = result.get(words)
                t.append(file[x])
                result[words] = t
    output_index(result)

# run file
if __name__ == '__main__':
    main()
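Note the {}.fromkeys(txt).keys() step: since the index records only which files a word appears in, each word needs to be added at most once per file, so duplicates are dropped before the posting lists are updated (a set(txt) would do the same).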
query.py
#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import os
import sys
import pprint, pickle
from nltk import PorterStemmer

def readfile(filename):
    input = open(filename, 'r')   # titles that need to be handled
    all_the_file = input.read()
    words = all_the_file.split()  # split the file at '\n' or ' '
    input.close()                 # close the file
    return words

def getdata():
    pkl_file = open('data.pkl', 'rb')   # the index is saved in the file 'data.pkl'
    data1 = pickle.load(pkl_file)       # unpickle the dictionary
    pkl_file.close()                    # close the file
    return data1                        # return the data

def output(result):
    if result == None:            # the word is not in the index (a single word returns None)
        print None
        return
    if len(result) == 0:          # the words are not in the index (several words return [])
        print None
        return
    if len(result) < 10:          # fewer than 10 records
        print result
    else:                         # 10 records or more: page through them
        print 'get ' + str(len(result)) + ' records'   # the record count
        for i in range(0, len(result) / 10 + 1):
            print '10 records start from ' + str(i * 10 + 1)
            if 10 * i + 10 < len(result):   # print from 10 * i to 10 * i + 10
                print result[10 * i : 10 * i + 10]
            else:                           # print from 10 * i to the end
                print result[10 * i : len(result)]
                break
            getstr = raw_input("Enter 'N' for next ten records & other input to quit : ")
            if getstr != 'N':
                break

# main function
def main():
    data_list = getdata()         # read the index
    STOPWORDS = readfile('stop_word.txt')
    stemmer = PorterStemmer()     # prepare for stemming
    while True:
        get_str = raw_input("Enter your query('\\'to quit): ")
        if get_str == '\\':       # leave the loop
            break
        get_str = get_str.lower()
        for ch in STOPWORDS:      # remove the stop words
            get_str = get_str.replace(ch, " ")
        query_list = get_str.split()   # split the query at '\n' or ' '
        query_list = [stemmer.stem(plural) for plural in query_list]   # handle stemming
        while True:
            if query_list != []:
                break
            get_str = raw_input("Please enter more information: ")
            get_str = get_str.lower()
            for ch in STOPWORDS:  # remove the stop words
                get_str = get_str.replace(ch, " ")
            query_list = get_str.split()
            query_list = [stemmer.stem(plural) for plural in query_list]   # handle stemming
        result = []
        for k in range(0, len(query_list)):
            if k == 0:            # the result list has not been built yet
                result = data_list.get(query_list[0]) or []   # guard against a term missing from the index
            else:                 # intersect with the posting list of the next term
                result = list(set(result).intersection(data_list.get(query_list[k]) or []))
        output(result)

# run file
if __name__ == '__main__':
    main()
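A multi-word query is implicitly an AND query: the posting list of the first stemmed term is intersected with the lists of the remaining terms via set.intersection, and output then pages the matching file names ten at a time.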