1. 环境安装

安装paddlepaddle

GPU版本：

python -m pip install paddlepaddle-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple

CPU版本：

python -m pip install paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple

安装PaddleOCR

pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+

其他库

pip install Pillow==9.5.0
pip install fitz==0.0.1.dev2
pip install numpy==1.24.4
pip install PyMuPDF==1.19.0
pip install opencv-python==4.6.0.66

2. 实现代码

代码

import os

import cv2
import fitz
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR, draw_ocr

# Run PaddleOCR over a PDF, print the recognised lines, then save the text of
# each page to data/data_txt/ and an annotated page image to data/images/.

# Chinese model with the text-angle classifier enabled.
# page_num is set to the number of pages in the input PDF (427 here).
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=427)
img_path = 'data/深度学习进阶自然语言处理.pdf'
result = ocr.ocr(img_path, cls=True)

# Print the raw recognition output of every page.
for res in result:
    # PaddleOCR's quick-start notes that a page with no detected text comes
    # back as None; skip it instead of failing on iteration.
    if res is None:
        continue
    for line in res:
        print(line)

# Render every PDF page to a BGR array so the results can be drawn on it.
imgs = []
with fitz.open(img_path) as pdf:
    for pg in range(pdf.page_count):
        page = pdf[pg]
        pm = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        imgs.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))

# Make sure the output folders exist before anything is written to them.
os.makedirs('data/data_txt', exist_ok=True)
os.makedirs('data/images', exist_ok=True)

for idx, res in enumerate(result):
    if res is None:
        # Nothing was recognised on this page, so there is nothing to save.
        continue

    image = imgs[idx]
    boxes = [line[0] for line in res]
    txts = [line[1][0] for line in res]
    scores = [line[1][1] for line in res]

    # Save the recognised text, one detected line per row.
    with open(f'data/data_txt/text_{idx}.txt', 'w', encoding='utf-8') as f:
        f.writelines(f'{txt}\n' for txt in txts)

    # Draw boxes and text onto the page image, then save the picture.
    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('data/images/page_{}.jpg'.format(idx))