import os
import re
# import chardet
# Resolve data-file paths relative to this script's directory, not the CWD.
curr_dir = os.path.dirname(os.path.abspath(__file__))
org_path = os.path.join(curr_dir, 'org.txt')
words_path = os.path.join(curr_dir, 'words.txt')  # absolute paths next to this script

# Clean the document: drop every character that is not a newline, an ASCII
# letter, or a CJK ideograph (U+4E00-U+9FA5), keeping only Chinese
# characters and letters.
# FIX: the original iterated `open(...)` directly and never closed the
# file handle; `with` guarantees it is closed.
with open(org_path, 'r', encoding="utf-8") as org_file:
    for line in org_file:
        line = re.sub('[^\\na-zA-Z\u4e00-\u9fa5]', '', line)
2、make_word_list 函数:
import pypinyin
def pinyin(word):
    """Return *word* transliterated to pinyin (NORMAL style, tone-free),
    with all syllables concatenated into a single string.

    Used to expand each sensitive word into an additional pinyin-spelled
    match target.
    """
    # pypinyin.pinyin returns one sub-list per input character; join each
    # sub-list, then join all of them, with no separators.
    # FIX: the original built the result with quadratic `s += ...`
    # concatenation; a single join is linear and idiomatic.
    return ''.join(''.join(part)
                   for part in pypinyin.pinyin(word, style=pypinyin.NORMAL))
# Resolve data-file paths relative to this script's directory, not the CWD.
curr_dir = os.path.dirname(os.path.abspath(__file__))
org_path = os.path.join(curr_dir, 'org.txt')
words_path = os.path.join(curr_dir, 'words.txt')  # absolute paths next to this script

# Build the sensitive-word list: every line of words.txt contributes both
# the word itself and its pinyin transliteration as match targets.
words_list = []
with open(words_path, 'r', encoding="utf-8") as file1_object:
    for raw_line in file1_object:
        word = raw_line.strip()
        if not word:
            # A blank line would append '' to the list, and '' is a
            # substring of every line — it would flag the whole document.
            continue
        words_list.append(word)
        # FIX: the original called pinyin(line) on the UNstripped line,
        # so the trailing newline leaked into the pinyin match string.
        words_list.append(pinyin(word))
3、sensitive_search 函数:
# Scan org.txt line by line; for every sensitive word found, write one
# numbered hit record to ans.txt.
#
# FIX: the original re-opened ans.txt with mode 'r+' inside the inner
# loop, which (a) raises FileNotFoundError when ans.txt does not yet
# exist, and (b) seeks to offset 0 on every open, so each hit overwrote
# the previous one.  It also never closed the org.txt handle.  Open both
# files exactly once, outside the loops, via `with`.
int_count = 0
ans_path = os.path.join(curr_dir, 'ans.txt')
with open(org_path, 'r', encoding="utf-8") as org_file, \
        open(ans_path, 'w', encoding="utf-8") as file3_object:
    for line in org_file:
        # Keep only newlines, ASCII letters and CJK ideographs.
        line = re.sub('[^\\na-zA-Z\u4e00-\u9fa5]', '', line)
        int_count += 1
        for word in words_list:
            if word in line:
                file3_object.write("Line%d:<%s>%s\n" % (int_count, word, word))