Excel、PDF資料處理
安裝對應的包
處理Excel資料,需要先安裝第三方xlrd/x、xlwt包,通常我們用pip安裝,在目前程式環境下執行指令:
pip install xlrd
pip install xlwt
通過正則比對excel對應列或者行元素值,将對應列行的資料比對成功後插入對應的excel表的行列中去,整體代碼如下:
# coding=utf-8
import re
from xlwt import Workbook
import xlrd
import sys
import os
def excel_process(path):
_data = xlrd.open_workbook(path)
table = _data.sheet_by_index(0)
# 依據下标擷取特定sheet表 table.nrows # 擷取該sheet表行數 table.ncols擷取該sheet表列
ncols = table.ncols # 擷取清單的有效列數
print ncols
# print table.cell(1, 5).value
book = Workbook(encoding='utf-8')
sheet = book.add_sheet('Sheet1') # 建立一個sheet
for i in range(table.nrows):
# k = list(table.row_values(1,1)) # 擷取行數的資料
# res = str(table.cell(i, 5).value)
res = table.cell_value(i, 5)
res2 = table.cell_value(i, 6)
# res = table.row(i)[5].value
if res:
if type(res) == float:
res = int(res)
res = unicode(res)
result = re.findall("1[0-9]{10}", res)
result = ''.join(result)
else:
result = ''
if res2:
if type(res2) == float:
res2 = int(res2)
res2 = unicode(res2)
result2 = re.findall("1[0-9]{10}", res2)
result2 = ''.join(result2)
else:
result2 = ''
print i,result
if i:
sheet.write(i, 1, label=result) # 向第1行第1列寫入擷取到的值
sheet.write(i, 2, label=result2) # 向第1行第1列寫入擷取到的值
book.save("20.xls")
return "ok"
if __name__ == '__main__':
path = "2020.xlsx"
excel_process(path)
python中正規表達式的使用(提取Excel内容):https://www.runoob.com/python/python-reg-expressions.html
讀取PDF檔案轉為txt檔案,首先需安裝處理PDF的第三方包 1 pip install pdfminer/ 2 pip install pdfminer3k/ 3 pip install pdfminer.six(推薦),具體代碼如下:
# coding=utf-8
import os, re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
# 将一個PDF轉換成txt檔案
def pdfTotxt(filepath, outpath):
try:
fp = open(filepath, 'rb') # 讀資料
outfp = open(outpath, 'w') # 寫資料
# 建立一個PDF資料總管對象來存儲共享資源,caching = False不緩存
rsrcmgr = PDFResourceManager(caching=False)
# 建立PDF裝置對象
laparams = LAParams()
device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams, imagewriter=None)
# 建立PDF解析器對象
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0,
password='', caching=False, check_extractable=True):
page.rotate = page.rotate % 360
interpreter.process_page(page)
# 關閉輸入流
fp.close()
# 關閉輸出流
device.close()
outfp.flush()
outfp.close()
except Exception as e:
print("Exception:%s", e)
# 一個檔案夾下的所有pdf文檔轉換成txt
def fileTotxt(fileDir):
files = os.listdir(fileDir)
tarDir = fileDir + 'txt'
if not os.path.exists(tarDir):
os.mkdir(tarDir)
replace = re.compile(r'\.pdf', re.I)
for file in files:
filePath = fileDir + '\\' + file
outPath = tarDir + '\\' + re.sub(replace, '', file) + '.txt'
pdfTotxt(filePath, outPath)
print("Saved " + outPath)
if __name__ == '__main__':
pdfTotxt(u'212.pdf', 'test.txt')
# fileTotxt('path') # PDF檔案夾路徑
批量将資料庫查詢結果集導出excel
寫入方法,使用調用即可
import openpyxl as xl
import os
import time
def write_excel_file(data, f_name, folder_path='F:\\cxhf\\XCHF\\insurance\\app\\api'):
result_path = os.path.join(folder_path, f_name)
# print('***** 開始寫入excel檔案 ' + result_path + ' ***** \n')
if os.path.exists(result_path):
# print('***** excel已存在,在表後添加資料 ' + result_path + ' ***** \n')
workbook = xl.load_workbook(result_path)
sheet = workbook.active
else:
# print('***** excel不存在,建立excel ' + result_path + ' ***** \n')
workbook = xl.Workbook()
workbook.save(result_path)
sheet = workbook.active
headers = ["姓名__", "手機号__", "__", "__", "__", "__", "__",
"__", "__", "__", "__"] #設定檔案列
sheet.append(headers)
sheet.append(data)
workbook.save(result_path)
return result_path
file_name = str(int(time.time() * 1000)) + '.xlsx'
para = "select * from table " # 資料庫查詢資料集
write_excel_file(para, file_name, folder_path='/root/temp')
url = 'https://xxxxxxxx.oss-cn-hangzhou.aliyuncs.com/' + \
upload_img.award_file('/root/temp' + '/' + file_name, file_name) # 傳回上傳至oss存儲位址供下載下傳