Excel、PDF資料處理

安裝對應的包

處理Excel資料，需要先安裝第三方xlrd/x、xlwt包，通常我們用pip安裝，在目前程式環境下執行指令：

pip install xlrd
pip install xlwt

通過正則比對excel對應列或者行元素值，将對應列行的資料比對成功後插入對應的excel表的行列中去，整體代碼如下：

# coding=utf-8
import re
from xlwt import Workbook
import xlrd
import sys
import os


def excel_process(path):
    _data = xlrd.open_workbook(path)
    table = _data.sheet_by_index(0)
    # 依據下标擷取特定sheet表 table.nrows # 擷取該sheet表行數 table.ncols擷取該sheet表列
    ncols = table.ncols  # 擷取清單的有效列數
    print ncols
    # print table.cell(1, 5).value
    book = Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')  # 建立一個sheet
    for i in range(table.nrows):
        # k = list(table.row_values(1,1))  # 擷取行數的資料
        # res = str(table.cell(i, 5).value)
        res = table.cell_value(i, 5)
        res2 = table.cell_value(i, 6)
        # res = table.row(i)[5].value
        if res:
            if type(res) == float:
                res = int(res)
                res = unicode(res)
            result = re.findall("1[0-9]{10}", res)
            result = ''.join(result)
        else:
            result = ''
        if res2:
            if type(res2) == float:
                res2 = int(res2)
                res2 = unicode(res2)
            result2 = re.findall("1[0-9]{10}", res2)
            result2 = ''.join(result2)
        else:
            result2 = ''
        print i,result
        if i:
            sheet.write(i, 1, label=result)  # 向第1行第1列寫入擷取到的值
            sheet.write(i, 2, label=result2)  # 向第1行第1列寫入擷取到的值
    book.save("20.xls")
    return "ok"

if __name__ == '__main__':
    path = "2020.xlsx"
    excel_process(path)

python中正規表達式的使用(提取Excel内容)：https://www.runoob.com/python/python-reg-expressions.html

讀取PDF檔案轉為txt檔案，首先需安裝處理PDF的第三方包 1 pip install pdfminer/ 2 pip install pdfminer3k/ 3 pip install pdfminer.six(推薦)，具體代碼如下：

# coding=utf-8
import os, re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams


# 将一個PDF轉換成txt檔案
def pdfTotxt(filepath, outpath):
    try:
        fp = open(filepath, 'rb')  # 讀資料
        outfp = open(outpath, 'w')  # 寫資料
        # 建立一個PDF資料總管對象來存儲共享資源,caching = False不緩存
        rsrcmgr = PDFResourceManager(caching=False)
        # 建立PDF裝置對象
        laparams = LAParams()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams, imagewriter=None)
        # 建立PDF解析器對象
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0,
                                      password='', caching=False, check_extractable=True):
            page.rotate = page.rotate % 360
            interpreter.process_page(page)
        # 關閉輸入流
        fp.close()
        # 關閉輸出流
        device.close()
        outfp.flush()
        outfp.close()
    except Exception as e:
        print("Exception:%s", e)


# 一個檔案夾下的所有pdf文檔轉換成txt
def fileTotxt(fileDir):
    files = os.listdir(fileDir)
    tarDir = fileDir + 'txt'
    if not os.path.exists(tarDir):
        os.mkdir(tarDir)
    replace = re.compile(r'\.pdf', re.I)
    for file in files:
        filePath = fileDir + '\\' + file
        outPath = tarDir + '\\' + re.sub(replace, '', file) + '.txt'
        pdfTotxt(filePath, outPath)
        print("Saved " + outPath)


if __name__ == '__main__':
    pdfTotxt(u'212.pdf', 'test.txt')
    # fileTotxt('path')  # PDF檔案夾路徑

批量将資料庫查詢結果集導出excel

寫入方法，使用調用即可

import openpyxl as xl
import os
import time
def write_excel_file(data, f_name, folder_path='F:\\cxhf\\XCHF\\insurance\\app\\api'):
    result_path = os.path.join(folder_path, f_name)
    # print('***** 開始寫入excel檔案 ' + result_path + ' ***** \n')
    if os.path.exists(result_path):
        #   print('***** excel已存在，在表後添加資料 ' + result_path + ' ***** \n')
        workbook = xl.load_workbook(result_path)
        sheet = workbook.active
    else:
        #  print('***** excel不存在，建立excel ' + result_path + ' ***** \n')
        workbook = xl.Workbook()
        workbook.save(result_path)
        sheet = workbook.active
        headers = ["姓名__", "手機号__", "__", "__", "__", "__", "__",
                   "__", "__", "__", "__"]   #設定檔案列
        sheet.append(headers)
    sheet.append(data)
    workbook.save(result_path)
    return result_path


file_name = str(int(time.time() * 1000)) + '.xlsx'
para = "select * from table " # 資料庫查詢資料集
write_excel_file(para, file_name, folder_path='/root/temp')
url = 'https://xxxxxxxx.oss-cn-hangzhou.aliyuncs.com/' + \
                  upload_img.award_file('/root/temp' + '/' + file_name, file_name) # 傳回上傳至oss存儲位址供下載下傳

Excel、PDF資料處理Excel、PDF資料處理寫入方法，使用調用即可

Excel、PDF資料處理

安裝對應的包

寫入方法，使用調用即可

繼續閱讀

别輕易轉資料分析了！太卷了

IBM Power程式設計馬拉松——以“碼”會友，把手言歡！

python中哪些函數可以進行清單排序？

This application failed to start because it could not find or load the Qt platform plugin "

R語言| 中介效應分析，Mediation包和BruceR包，循環Process函數

一套完整實用的IT規劃方法論

miRNA與轉錄組聯合分析

進階資料分析師憑什麼月薪三萬？一文解答你所有困惑

SparkSQL項目練習1 準備資料2 需求：各區域熱門商品Top3

SQL常見計算方法總結

一篇文章帶你使用模組化的思路解決泰迪杯-智慧政務問題（答複意見評價含代碼）

【資料處理】 python 基于Basemap地理資訊可視化資料可視化方法——Basemap效果

資料分析實戰20絕技

從大資料看技術，為什麼天貓雙11是史上最大數字經濟節日

線上教育巨頭多鄰國Duolingo入華一周年，中國市場馬力全開

【python】【資料處理】畫多元資料分布圖