# -*- coding: utf-8 -*-
# @Time : 2020/10/16 14:09 PM
# @Author : yangkaitong
# @FileName: preprocessing.py
######################################################################
## win下bat運作該指令,會将chm轉化為txt,部分會轉化成html ##
## hh -decompile D:\Desktop\ 123.chm ##
## 遺憾的是部分會是html,是以該腳本旨在轉化html為excel ##
######################################################################
import re
import os
import pandas as pd
def normalizing(text:str):
    """Collapse tag-stripped page text into one cleaned string.

    The input is split on newlines and each piece is stripped of surrounding
    whitespace. A piece is discarded when it is empty, when it contains any of
    the navigation/boilerplate markers listed below, or when it is a substring
    of the most recently kept piece. Surviving pieces have the unwanted tokens
    removed and are concatenated without any separator.

    Args:
        text: Raw text, one logical line per newline.

    Returns:
        The kept pieces joined together; an empty string if nothing survives.
    """
    # Any line containing one of these substrings is dropped entirely.
    drop_markers = ["Top "," ","Previous ","body","Next"]
    # These substrings are deleted from lines that are kept.
    strip_tokens = ["?"]

    kept = []
    for raw_piece in text.split("\n"):
        candidate = raw_piece.strip()
        if not candidate:
            continue
        if any(marker in candidate for marker in drop_markers):
            continue
        # Substring test against the previously kept (already cleaned) piece.
        if kept and candidate in kept[-1]:
            continue
        for token in strip_tokens:
            candidate = candidate.replace(token, "")
        kept.append(candidate)
    return "".join(kept)
def html_tag_rm(content: str):
    """Return *content* with every ``<...>`` tag-like span removed.

    A span is a ``<``, followed by one or more characters that are not ``>``
    (newlines included), followed by ``>``. Text outside such spans is left
    untouched.
    """
    return re.sub(r'<[^>]+>', '', content, flags=re.S)
def read_htm(file):
    """Read an entire file in text mode and return its contents.

    Args:
        file: Path of the file to read.

    Returns:
        The full contents of the file as a string, decoded with the
        platform's default text encoding (no encoding is passed to ``open``).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version left the file open.
    with open(file, 'r') as htmlf:
        return htmlf.read()
def get_text(file):
    """Load a file and return its cleaned text.

    Reads the file with ``read_htm``, removes tags with ``html_tag_rm`` and
    then passes the result through ``normalizing``.

    Args:
        file: Path of the file to process.

    Returns:
        The string produced by ``normalizing``.
    """
    raw_markup = read_htm(file)
    return normalizing(html_tag_rm(raw_markup))
def htm2csv(path,save_path):
    """Extract text from every file in a directory and save it as a spreadsheet.

    Each regular file directly inside *path* is run through ``get_text``.
    Files whose extracted text is empty are skipped. Directory entries that
    are not regular files are reported on stdout and skipped. The result is
    written with ``DataFrame.to_excel`` as two columns, ``file_name`` and
    ``text``.

    Args:
        path: Root directory containing the html files.
        save_path: Path of the Excel file to write.
    """
    names = []
    texts = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isfile(full_path):
            print("%s 不存在"%full_path)
            continue
        extracted = get_text(full_path)
        if extracted == "":
            continue
        # Appended together so both columns stay the same length.
        texts.append(extracted)
        names.append(entry)
    frame = pd.DataFrame({'file_name': names,'text': texts})
    frame.to_excel(save_path)
if __name__ == '__main__':
    # Script entry point: convert everything under "data_dir" into
    # "result.xlsx" in the current working directory.
    htm2csv(path="data_dir", save_path=r"result.xlsx")