Word直接导出的pdf不够清晰,使用打印导出的pdf又不带书签以及目录跳转功能这一问题,查阅网上资料使用Adobe DC似乎能够解决但是下载安装比较麻烦,于是写了python程序解决该问题。
解决思路: 使用python脚本对两个pdf文件进行合并,合并小体积的带书签的pdf和高清版本的pdf文件。
Step1:准备带书签的pdf,并命名为999.pdf (该名称与程序对应)
使用word导出功能,选择最小文件导出,并勾选书签导出选项。
Step2:打印输出高清pdf文件,并命名为a.pdf
打印使用福昕pdf的虚拟打印机,最高支持2400dpi的清晰度,我一般选1200dpi,已经足够清晰了。(也可以使用自带的Microsoft pdf 打印输出 但最高只支持600dpi)
打印导出文件,并命名为a.pdf
如果遇到导出的pdf文件太大可以不勾选word选项中的“不压缩文件中的图像”来进一步限制打印输出文件大小。
###使用Microsoft pdf打印输出文件也可以用代码替代执行,但最高只有600dpi,不如福昕pdf
import win32com.client
from pathlib import Path
import os
import timedef convert_docx_to_hd_pdf(input_docx, output_pdf):# 确保输出路径存在output_path = Path(output_pdf).parentos.makedirs(output_path, exist_ok=True)# 创建Word应用对象word = win32com.client.DispatchEx("Word.Application")word.Visible = Truetry:# 打开文档doc = word.Documents.Open(str(Path(input_docx).resolve()))# 获取文档总页数total_pages = doc.ComputeStatistics(2) # 2 = wdStatisticPagesprint(f"文档共 {total_pages} 页,开始转换...")# 输出路径output_pdf_abs = str(Path(output_pdf).resolve())# 设置打印机为Microsoft Print to PDFword.ActivePrinter = "Microsoft Print to PDF"# 设置打印选项word.Options.PrintBackground = Trueword.Application.Options.PrintDraft = Falseword.Application.Options.PrintProperties = False# 执行打印print("开始高质量打印...")word.ActiveDocument.PrintOut(OutputFileName=output_pdf_abs,Range=0,Item=0,Copies=1,Pages="",PageType=0,PrintToFile=True,Collate=False)# 等待打印完成print("正在等待打印完成...")# 等待文件生成max_wait = 30 # 最多等待30秒start_time = time.time()while time.time() - start_time < max_wait:if os.path.exists(output_pdf) and os.path.getsize(output_pdf) > 0:print(f"PDF导出完成: {output_pdf}")print(f"文件大小: {os.path.getsize(output_pdf)} 字节")return Truetime.sleep(1)print("等待超时,请检查输出文件")return Falseexcept Exception as e:print(f"转换失败: {str(e)}")return Falsefinally:try:doc.Close(SaveChanges=False)except:password.Quit()# 执行转换
if __name__ == "__main__":convert_docx_to_hd_pdf('88.docx', '999.pdf') ## 88.docx为word文件名
总之会得到两个pdf文件如下
Step3:创建虚拟环境,并执行文件合并
conda create -n xxx python=3.9conda activate xxxcd xxxxxpip install pikepdf pathlib os
然后运行程序,在此之前需在程序中指定目录页码范围,比如我的word目录对应9-10页
修改如下部分
运行
import pikepdf
from pathlib import Path
import osstart_page = 8
end_page = 9def merge_pdfs_with_bookmarks(image_pdf_path, bookmark_pdf_path, output_pdf_path):"""合并两个PDF文件,保留第二个PDF的书签信息和目录页,其他页面使用第一个PDF的内容参数:image_pdf_path: 包含高质量图片的PDF路径bookmark_pdf_path: 包含书签信息的PDF路径output_pdf_path: 输出PDF的路径"""print(f"开始合并PDF文件...")print(f"图片源PDF: {image_pdf_path}")print(f"书签源PDF: {bookmark_pdf_path}")try:# 确保输出目录存在output_dir = Path(output_pdf_path).parentos.makedirs(output_dir, exist_ok=True)# 打开两个PDF文件with pikepdf.open(image_pdf_path) as image_pdf, pikepdf.open(bookmark_pdf_path) as bookmark_pdf:# 检查页数是否一致if len(image_pdf.pages) != len(bookmark_pdf.pages):print(f"警告: 两个PDF的页数不一致! 图片PDF: {len(image_pdf.pages)}页, 书签PDF: {len(bookmark_pdf.pages)}页")print("继续合并,但可能导致书签指向错误的页面")# 创建一个新的PDF,以bookmark_pdf为基础merged_pdf = pikepdf.Pdf.open(bookmark_pdf_path)# 创建页面映射表,记录原始页面和新页面的对应关系page_map = {}# 替换除了第8-9页以外的所有页面for i in range(len(merged_pdf.pages)):# 页码从0开始,所以第9-10页对应索引8-9if (i > end_page or i < start_page) and i < len(image_pdf.pages): # i != 10-12# 保存原始页面的引用old_page = merged_pdf.pages[i]old_objgen = old_page.obj.objgen# 使用正确的方法替换页面# pikepdf不支持直接删除页面,但可以直接替换merged_pdf.pages[i] = pikepdf.Page(image_pdf.pages[i])# 记录页面映射关系page_map[old_objgen[0]] = merged_pdf.pages[i].obj# 修复文档内部链接(目录页链接)print("正在修复文档内部链接...")# 特别处理目录页(第9-10页,索引8-9)for i in [start_page, end_page]:if '/Annots' in merged_pdf.pages[i]:annots = merged_pdf.pages[i]['/Annots']if isinstance(annots, pikepdf.Array):# print(annots)for annot in annots:# 检查是否是链接注释if annot.get('/Subtype') == '/Link':# 处理直接目标if '/Dest' in annot:dest = annot['/Dest']if isinstance(dest, pikepdf.Array) and len(dest) > 0:if hasattr(dest[0], 'objgen'):ref_id = dest[0].objgen[0]if ref_id in page_map:dest[0] = page_map[ref_id]# 处理动作目标elif '/A' in annot and isinstance(annot['/A'], pikepdf.Dictionary) and '/D' in annot['/A']:dest = annot['/A']['/D']if isinstance(dest, pikepdf.Array) and len(dest) > 0:if hasattr(dest[0], 'objgen'):ref_id = dest[0].objgen[0]if ref_id in page_map:dest[0] = page_map[ref_id]# 提取书签信息bookmarks = []if hasattr(bookmark_pdf, 'Root') and '/Outlines' in bookmark_pdf.Root:print("正在提取书签信息...")# 递归提取书签def extract_bookmarks(outline, bookmarks, depth=0):if '/First' not in outline:returncurrent = outline['/First']while True:title = str(current.get('/Title', ''))dest = current.get('/Dest', None)page_num = 0if dest is not None and isinstance(dest, pikepdf.Array) and len(dest) > 0:# 查找目标页面page_ref = dest[0]for i, page in enumerate(bookmark_pdf.pages):# 使用对象ID比较而不是same_as方法if hasattr(page.obj, 'objgen') and hasattr(page_ref, 'objgen'):if page.obj.objgen == page_ref.objgen:page_num = ibreak# 创建书签项bookmark = {'title': title,'page': page_num,'children': []}bookmarks.append(bookmark)# 处理子书签if '/First' in current:extract_bookmarks(current, bookmark['children'], depth + 1)# 移动到下一个书签if '/Next' not in current:breakcurrent = current['/Next']# 提取所有书签extract_bookmarks(bookmark_pdf.Root['/Outlines'], bookmarks)print(f"提取了 {len(bookmarks)} 个顶级书签")# 如果有提取到书签,添加到新PDFif bookmarks:print("正在将书签添加到新PDF...")# 递归创建书签def create_bookmarks(pdf, bookmarks, parent=None):if not bookmarks:return Nonefirst = Nonelast = Noneprev = Nonefor bookmark in bookmarks:# 创建新书签current = pdf.make_indirect(pikepdf.Dictionary({'/Title': pikepdf.String(bookmark['title']),'/Parent': parent}))# 设置目标页面page_idx = bookmark['page']if page_idx < len(pdf.pages):dest = [pdf.pages[page_idx].obj, pikepdf.Name('/Fit')]current['/Dest'] = pdf.make_indirect(pikepdf.Array(dest))# 处理链接关系if first is None:first = currentif prev is not None:prev['/Next'] = currentcurrent['/Prev'] = prevprev = currentlast = current# 处理子书签if bookmark['children']:children_first = create_bookmarks(pdf, bookmark['children'], current)if children_first:current['/First'] = children_first# 找到最后一个子书签children_last = children_firstwhile '/Next' in children_last:children_last = children_last['/Next']current['/Last'] = children_lastcurrent['/Count'] = len(bookmark['children'])return first# 创建书签字典outlines = merged_pdf.make_indirect(pikepdf.Dictionary({'/Type': pikepdf.Name('/Outlines'),'/Count': len(bookmarks)}))# 创建书签树first = create_bookmarks(merged_pdf, bookmarks, outlines)if first:outlines['/First'] = first# 找到最后一个书签last = firstwhile '/Next' in last:last = last['/Next']outlines['/Last'] = last# 添加到PDFmerged_pdf.Root['/Outlines'] = outlinesprint("成功添加书签到新PDF")# 保存合并后的PDFmerged_pdf.save(output_pdf_path)print(f"PDF合并完成! 输出文件: {output_pdf_path}")print(f"文件大小: {os.path.getsize(output_pdf_path)} 字节")return Trueexcept Exception as e:print(f"合并PDF时出错: {str(e)}")import tracebacktraceback.print_exc()return Falseif __name__ == "__main__":# 文件路径image_pdf = ".\a.pdf" # 包含高质量图片的PDFbookmark_pdf = ".\999.pdf" # 包含书签信息的PDFoutput_pdf = ".\out.pdf" # 输出文件# 执行合并merge_pdfs_with_bookmarks(image_pdf, bookmark_pdf, output_pdf)
大功告成
该文件大小一般与高清pdf文件大小相当...
特别说明:该程序对正文中的图跳转未作程序编写。