如何使用python 给PDF生成目录
主要步骤:
步骤1.
准备目录的txt,如果PDF是图片无法复制出来,可以使用qq图文识别提取目录内容。
目录 txt 格式(每行为“标题 页码”,标题与页码之间用空格分隔;用行首缩进表示层级,无缩进为一级目录,缩进越大层级越深):
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsICM38FdsYkRGZkRG9lcvx2bjxiNx8VZ6l2cs0TPR9EeRpXT3tmaNBDOsJGcohVYsR2MMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zZuBnLxIjM1MjNwATMwITMwEjMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
关于QQ图片文字提取:
(1)利用qq聊天框中的屏幕识图
(2)转为在线文档
(3)复制目录内容生成一个txt 文件
把页码前面的点(目录中的引导点“……”)去掉
步骤2
执行 Python 脚本生成目录,会在原 PDF 所在目录生成一个名为“原文件名-new.pdf”的新文件
import re
import sys
from distutils.version import LooseVersion
from os.path import exists, splitext
from PyPDF2 import PdfFileReader, PdfFileWriter
# True when running under Python 2. sys.version_info is compared numerically,
# which avoids parsing the free-form sys.version string with distutils'
# LooseVersion (distutils is deprecated by PEP 632 and removed in Python 3.12).
is_python2 = sys.version_info[0] < 3
def _get_parent_bookmark(current_indent, history_indent, bookmarks):
'''The parent of A is the nearest bookmark whose indent is smaller than A's
'''
assert len(history_indent) == len(bookmarks)
if current_indent == 0:
return None
for i in range(len(history_indent) - 1, -1, -1):
# len(history_indent) - 1 ===> 0
if history_indent[i] < current_indent:
return bookmarks[i]
return None
def addBookmark(pdf_path, bookmark_txt_path, page_offset, encoding='gbk'):
    """Add outline bookmarks described by a text file to a copy of a PDF.

    Each line of the text file has the form ``<title> <page number>``,
    separated by whitespace; the last token is the page number and everything
    before it is the title. Lines with fewer than two tokens (including blank
    lines) are skipped. The nesting level of an entry is derived from its
    leading whitespace: no indent is a top-level entry, and an entry becomes a
    child of the nearest preceding entry with a smaller indent.

    The result is written to ``<pdf_path without extension>-new.pdf``.

    Args:
        pdf_path: Path of the source PDF.
        bookmark_txt_path: Path of the text file describing the bookmarks.
        page_offset: Value added to every page number from the text file
            (after converting it from 1-based to 0-based). Presumably the
            number of pages that precede printed page 1 in the PDF.
        encoding: Text encoding of ``bookmark_txt_path``. Defaults to
            ``'gbk'``, the value that was previously hard-coded.

    Returns:
        A status message. It starts with ``"Error:"`` when an input file is
        missing or a target page falls outside the document; in the
        out-of-range case no output file is written.

    Raises:
        ValueError: If the last token of a bookmark line is not an integer.
    """
    if not exists(pdf_path):
        return "Error: No such file: {}".format(pdf_path)
    if not exists(bookmark_txt_path):
        return "Error: No such file: {}".format(bookmark_txt_path)

    # open(..., encoding=...) only exists on Python 3, so the former Python 2
    # ``unicode(...)`` branch could never run and has been dropped.
    with open(bookmark_txt_path, 'r', encoding=encoding) as f:
        bookmark_lines = f.readlines()

    out_path = splitext(pdf_path)[0] + '-new.pdf'

    # Keep the source PDF open until the output is written (page content is
    # read lazily), and make sure the handle is closed afterwards.
    with open(pdf_path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        writer = PdfFileWriter()
        writer.cloneDocumentFromReader(reader)
        max_pages = reader.getNumPages()

        # history_indent[i] is the indent width of bookmarks[i]; both lists
        # grow in lockstep so the parent lookup can scan them backwards.
        bookmarks, history_indent = [], []
        for line in bookmark_lines:
            fields = line.split()
            if len(fields) < 2:
                # Blank line, or a line with no separate page number.
                continue

            indent_size = len(line) - len(line.lstrip())
            title = ' '.join(fields[:-1])
            # Page numbers in the text file are 1-based; PDF indices 0-based.
            target = int(fields[-1]) - 1 + page_offset
            if target >= max_pages:
                return "Error: page index out of range: %d >= %d" % (target, max_pages)
            if target < 0:
                # A negative index would not refer to the intended page.
                return "Error: page index out of range: %d < 0" % target

            parent = _get_parent_bookmark(indent_size, history_indent, bookmarks)
            history_indent.append(indent_size)
            bookmarks.append(writer.addBookmark(title, target, parent=parent))

        with open(out_path, 'wb') as f:
            writer.write(f)

    return "The bookmarks have been added to %s" % out_path
if __name__ == "__main__":
    # Usage: python <script> [pdf_path [bookmark_txt_path [page_offset]]]
    # Any argument that is omitted falls back to the original demo value.
    cli_args = sys.argv[1:]
    pdf_path = cli_args[0] if len(cli_args) > 0 else './Hive.pdf'
    bookmark_txt_path = cli_args[1] if len(cli_args) > 1 else './format.txt'
    page_offset = int(cli_args[2]) if len(cli_args) > 2 else 22
    # addBookmark reports both success and failure through its return value,
    # so print it; otherwise errors would pass silently.
    print(addBookmark(pdf_path, bookmark_txt_path, page_offset))
生成新的带目录的pdf 文件
参考https://www.zhihu.com/question/344805337/answer/819338479