预览
第1步:理解基本结构和导入必要的库
# 1. 首先导入需要的库
import os # 用于处理文件和路径
import cv2 # 用于图像处理
import numpy as np # 用于数值计算
from paddleocr import PaddleOCR # 用于文字识别
from pdf2image import convert_from_path # 用于PDF转图像
import time # 用于计时
第2步:创建基本类结构
class PDFTextExtractor:
    """Extract text from colour-highlighted regions of PDF pages using OCR."""

    def __init__(self):
        """Create the OCR engine and define the highlight colours to detect."""
        # Initialise the OCR engine.
        self.ocr = PaddleOCR(
            use_angle_cls=True,
            lang='ch',  # recognise Chinese text
            use_gpu=False,  # run without a GPU
            show_log=False,  # do not print PaddleOCR logs
        )
        # Colour ranges to recognise (yellow and red), each stored as a
        # lower/upper bound pair.
        self.color_ranges = {
            'yellow': {
                'lower': np.array([15, 70, 70]),
                'upper': np.array([35, 255, 255]),
            },
            'red': {
                'lower': np.array([0, 70, 70]),
                'upper': np.array([15, 255, 255]),
            },
        }
第3步:创建主要处理函数
def process_pdf(self, pdf_path, output_path='extracted_text.txt'):try:# 检查PDF文件是否存在if not os.path.exists(pdf_path):raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")print(f"开始处理PDF: {pdf_path}")start_time = time.time()# 设置poppler路径(需要先安装poppler)poppler_path = r"E:\Proper\poppler-24.08.0\Library\bin"if not os.path.exists(poppler_path):raise Exception(f"Poppler 路径不存在: {poppler_path}")# 获取PDF总页数total_pages = self.get_pdf_page_count(pdf_path, poppler_path)print(f"PDF总页数: {total_pages}")# 处理每一页with open(output_path, 'w', encoding='utf-8') as f:# 处理代码...
第4步:创建图像预处理函数
def preprocess_image(self, pil_image):
    """Prepare a rendered page for colour masking and OCR.

    Args:
        pil_image: Page image exposing ``size`` and ``resize`` (a PIL image).

    Returns:
        The page as a BGR array, blurred and with contrast/brightness raised.
    """
    # 1. Shrink oversized pages first so the later steps work on less data.
    pil_image = self.resize_image(pil_image)
    # 2. Convert to OpenCV's BGR channel order.
    img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    # Light Gaussian blur to reduce noise.
    img = cv2.GaussianBlur(img, (3, 3), 0)
    # Raise contrast (alpha) and brightness (beta).
    img = cv2.convertScaleAbs(img, alpha=1.2, beta=10)
    return img


def resize_image(self, image, max_dimension=2000):
    """Scale ``image`` down proportionally so no side exceeds the limit.

    Args:
        image: Object with a ``size`` ``(width, height)`` attribute and a
            ``resize((width, height))`` method.
        max_dimension: Largest allowed width or height in pixels.

    Returns:
        ``image`` itself when it already fits, otherwise the resized copy.
    """
    width, height = image.size
    longest_side = max(width, height)
    if longest_side <= max_dimension:
        return image
    # Use one scale factor for both sides to keep the aspect ratio.
    scale = max_dimension / longest_side
    return image.resize((int(width * scale), int(height * scale)))
第5步:创建文本提取函数
def extract_colored_text(self, img, color_lower, color_upper, min_confidence=0.6):
    """Run OCR on the parts of ``img`` whose colour lies in an HSV range.

    Args:
        img: BGR image array.
        color_lower: Lower HSV bound passed to ``cv2.inRange``.
        color_upper: Upper HSV bound passed to ``cv2.inRange``.
        min_confidence: OCR results must score strictly above this value.

    Returns:
        List of recognised, stripped, non-empty strings. Any failure is
        reported on stdout and yields an empty list.
    """
    try:
        # 1. HSV makes it easier to select pixels by colour.
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # 2. Binary mask of the pixels inside the colour range.
        mask = cv2.inRange(hsv, color_lower, color_upper)

        # 3. Clean up the mask: dilate, erode, then a morphological opening.
        kernel = np.ones((3, 3), np.uint8)
        mask = cv2.dilate(mask, kernel, iterations=2)
        mask = cv2.erode(mask, kernel, iterations=1)
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

        # 4. Keep only the masked region of the image.
        result = cv2.bitwise_and(img, img, mask=mask)

        # 5. Grayscale with boosted contrast for OCR.
        gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)
        gray = cv2.convertScaleAbs(gray, alpha=1.5, beta=10)

        # 6. Recognise text.
        ocr_result = self.ocr.ocr(gray, cls=True)

        # 7. Collect confident results. The outer result and each line
        #    may be None, so both are checked.
        texts = []
        for line in ocr_result or []:
            if line is None:
                continue
            for word_info in line:
                if not (isinstance(word_info, list) and len(word_info) >= 2):
                    continue
                # word_info[1] holds (text, confidence).
                text = word_info[1][0].strip()
                confidence = word_info[1][1]
                if confidence > min_confidence and text:
                    texts.append(text)
        return texts
    except Exception as e:
        # Best effort: report the failure and return nothing for this colour.
        print(f"文本提取错误: {str(e)}")
        return []
第6步:创建PDF页数获取函数
def get_pdf_page_count(self, pdf_path, poppler_path):
    """Return the number of pages in a PDF, or 0 if it cannot be read.

    Args:
        pdf_path: Path of the PDF file.
        poppler_path: Directory containing the Poppler binaries.

    Returns:
        The ``'Pages'`` entry reported by pdf2image's ``pdfinfo_from_path``;
        0 after printing a message when anything goes wrong.
    """
    try:
        # Query the PDF metadata through pdf2image.
        from pdf2image.pdf2image import pdfinfo_from_path

        info = pdfinfo_from_path(pdf_path, poppler_path=poppler_path)
        return info['Pages']
    except Exception as e:
        print(f"获取PDF页数失败: {str(e)}")
        return 0
第7步:创建主函数
def main():
    """Create an extractor and process one PDF, reporting any failure."""
    try:
        # 1. Create the PDF text extractor.
        extractor = PDFTextExtractor()

        # 2. Path of the PDF to process (placeholder: replace with a real path).
        pdf_path = r"你的PDF文件路径"

        # 3. Process the PDF.
        extractor.process_pdf(pdf_path)
    except Exception as e:
        print(f"程序执行失败: {str(e)}")


# Program entry point.
if __name__ == "__main__":
    main()
使用步骤:
- 安装必要的库:
pip install opencv-python numpy paddleocr pdf2image pillow
- 安装Poppler:
- 下载Poppler并解压
- 将bin目录添加到环境变量
- 修改代码中的路径:
- PDF文件路径
- Poppler路径
- 运行程序
好的,让我详细解释如何一步步实现这个程序:
第8步:环境准备
- 创建项目文件夹
mkdir pdf_extractor
cd pdf_extractor
- 创建虚拟环境(推荐)
python -m venv venv
# Windows激活虚拟环境
venv\Scripts\activate
# Linux/Mac激活虚拟环境
source venv/bin/activate
- 安装所需库
pip install opencv-python
pip install numpy
pip install paddleocr
pip install pdf2image
pip install pillow
第9步:创建项目结构
pdf_extractor/
├── main.py            # 主程序文件
├── requirements.txt   # 依赖库列表
└── output/            # 输出文件夹
第10步:编写完整代码
在 main.py 中:
import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import time


class PDFTextExtractor:
    """Extract text from colour-highlighted regions of PDF pages using OCR."""

    # Default Poppler "bin" directory; pass ``poppler_path`` to override it.
    DEFAULT_POPPLER_PATH = r"E:\Proper\poppler-24.08.0\Library\bin"

    def __init__(self, poppler_path=DEFAULT_POPPLER_PATH):
        """Create the OCR engine and define the highlight colours.

        Args:
            poppler_path: Directory containing the Poppler binaries, stored
                on the instance for the page-count lookup.
        """
        # Initialise OCR.
        print("初始化OCR系统...")
        self.ocr = PaddleOCR(
            use_angle_cls=True,
            lang='ch',
            use_gpu=False,
            show_log=False,
        )
        # Colour ranges (lower/upper bounds) for each highlight colour.
        self.color_ranges = {
            'yellow': {
                'lower': np.array([15, 70, 70]),
                'upper': np.array([35, 255, 255]),
            },
            'red': {
                'lower': np.array([0, 70, 70]),
                'upper': np.array([15, 255, 255]),
            },
        }
        self.poppler_path = poppler_path
        print("初始化完成!")

    def process_pdf(self, pdf_path, output_path='output/extracted_text.txt'):
        """Process a PDF file and write the extracted text to ``output_path``.

        Args:
            pdf_path: Path of the PDF to read.
            output_path: Text file to write; its directory is created if needed.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: Any other error is printed and re-raised unchanged.
        """
        try:
            # Create the output directory. A bare file name has no directory
            # part, and os.makedirs('') would raise, so skip it in that case.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Check that the input file exists.
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

            print(f"开始处理PDF: {pdf_path}")
            start_time = time.time()

            # Process every page.
            self._process_pages(pdf_path, output_path)

            # Report completion.
            end_time = time.time()
            print(f"\n处理完成!用时: {end_time - start_time:.2f}秒")
            print(f"结果已保存到: {output_path}")
        except Exception as e:
            print(f"处理过程中出错: {str(e)}")
            raise

    def _process_pages(self, pdf_path, output_path):
        """Process all pages of the PDF, writing results to ``output_path``."""
        # get_pdf_page_count takes (pdf_path, poppler_path), so the Poppler
        # directory must be passed along with the PDF path.
        total_pages = self.get_pdf_page_count(pdf_path, self.poppler_path)
        print(f"PDF总页数: {total_pages}")

        # Open the output file once and process page by page (1-based).
        with open(output_path, 'w', encoding='utf-8') as f:
            for page_num in range(1, total_pages + 1):
                self._process_single_page(pdf_path, page_num, total_pages, f)
好的,让我们继续完成代码的其余部分:
def _process_single_page(self, pdf_path, page_num, total_pages, output_file):
    """Render one PDF page, OCR its highlighted text and write the result.

    Args:
        pdf_path: Path of the PDF file.
        page_num: 1-based page number to process.
        total_pages: Total page count, used only for the progress message.
        output_file: Open text file that receives the page's results.

    Errors are printed and swallowed so one bad page does not stop the run.
    """
    print(f"\n处理第 {page_num}/{total_pages} 页...")
    try:
        # 1. Render just this page to an image. An instance attribute
        #    ``poppler_path`` overrides the default Poppler location.
        pages = convert_from_path(
            pdf_path,
            first_page=page_num,
            last_page=page_num,
            dpi=200,  # rendering resolution
            poppler_path=getattr(
                self, "poppler_path", r"E:\Proper\poppler-24.08.0\Library\bin"
            ),
            thread_count=1,
        )
        if not pages:
            print(f"警告: 第 {page_num} 页转换失败")
            return

        # 2-3. Take the page image and preprocess it.
        img = self.preprocess_image(pages[0])

        # 4. Extract text for each configured colour.
        page_results = []
        for color_name, color_range in self.color_ranges.items():
            print(f"处理{color_name}色文本...")
            highlighted_text = self.extract_colored_text(
                img.copy(),
                color_range['lower'],
                color_range['upper'],
            )
            if highlighted_text:
                page_results.extend(highlighted_text)

        # 5. Save results, flushing so partial output survives a later crash.
        if page_results:
            output_file.write(f"\n第{page_num}页标注文本:\n")
            output_file.write('\n'.join(page_results) + '\n')
            output_file.flush()
            print(f"第 {page_num} 页找到 {len(page_results)} 条文本")
        else:
            print(f"第 {page_num} 页未找到高亮文本")
    except Exception as e:
        print(f"处理第 {page_num} 页时出错: {str(e)}")


def preprocess_image(self, pil_image):
    """Resize a page image and convert it to an enhanced BGR array.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        # 1. Shrink oversized images.
        resized_image = self.resize_image(pil_image)
        # 2. Convert to OpenCV's BGR channel order.
        img = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
        # 3. Enhance: blur to reduce noise, then raise contrast/brightness.
        img = cv2.GaussianBlur(img, (3, 3), 0)
        img = cv2.convertScaleAbs(img, alpha=1.2, beta=10)
        return img
    except Exception as e:
        print(f"图像预处理错误: {str(e)}")
        raise


def resize_image(self, image, max_dimension=2000):
    """Scale ``image`` down proportionally so no side exceeds the limit.

    Args:
        image: Object with a ``size`` ``(width, height)`` attribute and a
            ``resize((width, height))`` method.
        max_dimension: Largest allowed width or height in pixels.

    Returns:
        ``image`` itself when it already fits, otherwise the resized copy.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        width, height = image.size
        longest_side = max(width, height)
        if longest_side <= max_dimension:
            return image
        # Use one scale factor for both sides to keep the aspect ratio.
        scale = max_dimension / longest_side
        return image.resize((int(width * scale), int(height * scale)))
    except Exception as e:
        print(f"图像缩放错误: {str(e)}")
        raise
使用示例:
def main():
    """Create the output directory, build an extractor and process one PDF.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        # 1. Create the output directory.
        os.makedirs('output', exist_ok=True)

        # 2. Create the extractor.
        print("初始化PDF文本提取器...")
        extractor = PDFTextExtractor()

        # 3. Input and output paths (the PDF path is a placeholder to replace).
        pdf_path = r"你的PDF文件路径"
        output_path = "output/extracted_text.txt"

        # 4. Process the PDF.
        print(f"开始处理PDF文件: {pdf_path}")
        extractor.process_pdf(pdf_path, output_path)
    except Exception as e:
        print(f"程序执行失败: {str(e)}")
        raise


if __name__ == "__main__":
    main()
使用说明:
- 准备工作:
- 安装所需库
- 安装Poppler并设置路径
- 准备要处理的PDF文件
- 修改配置:
- 修改PDF文件路径
- 修改Poppler路径
- 根据需要调整颜色范围
- 运行程序:
python main.py
- 查看结果:
- 输出文件将保存在output目录下
- 程序会显示处理进度和结果
完整项目代码
import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import time


class PDFTextExtractor:
    """Extract text from colour-highlighted regions of PDF pages using OCR."""

    # Default Poppler "bin" directory; pass ``poppler_path`` to override it.
    DEFAULT_POPPLER_PATH = r"E:\Proper\poppler-24.08.0\Library\bin"

    def __init__(self, poppler_path=DEFAULT_POPPLER_PATH):
        """Create the OCR engine and define the highlight colours.

        Args:
            poppler_path: Directory containing the Poppler binaries.
        """
        self.ocr = PaddleOCR(
            use_angle_cls=True,
            lang='ch',
            use_gpu=False,
            show_log=False,
        )
        # Colour ranges (lower/upper bounds) for each highlight colour.
        self.color_ranges = {
            'yellow': {
                'lower': np.array([15, 70, 70]),
                'upper': np.array([35, 255, 255]),
            },
            'red': {
                'lower': np.array([0, 70, 70]),
                'upper': np.array([15, 255, 255]),
            },
        }
        self.poppler_path = poppler_path

    def process_pdf(self, pdf_path, output_path='extracted_text.txt'):
        """Extract highlighted text from every page and write it to a file.

        Args:
            pdf_path: Path of the PDF to read.
            output_path: Text file that receives the per-page results.

        Raises:
            FileNotFoundError: If the PDF or the Poppler directory is missing.
            Exception: Any other error is printed and re-raised unchanged.
        """
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

            print(f"开始处理PDF: {pdf_path}")
            start_time = time.time()

            poppler_path = self.poppler_path
            if not os.path.exists(poppler_path):
                raise FileNotFoundError(f"Poppler 路径不存在: {poppler_path}")

            # Total number of pages in the PDF.
            total_pages = self.get_pdf_page_count(pdf_path, poppler_path)
            print(f"PDF总页数: {total_pages}")

            with open(output_path, 'w', encoding='utf-8') as f:
                for page_num in range(1, total_pages + 1):
                    self._process_single_page(
                        pdf_path, page_num, total_pages, poppler_path, f
                    )

            end_time = time.time()
            print(f"\n处理完成!用时: {end_time - start_time:.2f}秒")
            print(f"结果已保存到: {output_path}")
        except Exception as e:
            print(f"处理过程中出错: {str(e)}")
            raise

    def _process_single_page(self, pdf_path, page_num, total_pages,
                             poppler_path, output_file):
        """Render one page, OCR its highlighted text and write the result.

        Errors are printed and swallowed so one bad page does not stop the run.
        """
        print(f"\n处理第 {page_num}/{total_pages} 页...")
        try:
            # Render only this page to keep memory use bounded.
            pages = convert_from_path(
                pdf_path,
                first_page=page_num,
                last_page=page_num,
                dpi=200,
                poppler_path=poppler_path,
                thread_count=1,
            )
            if not pages:
                print(f"警告: 第 {page_num} 页转换失败")
                return

            # Convert and preprocess the page image.
            img = self.preprocess_image(pages[0])

            # Extract text for each configured colour.
            page_results = []
            for color_name, color_range in self.color_ranges.items():
                print(f"处理{color_name}色文本...")
                highlighted_text = self.extract_colored_text(
                    img.copy(),  # work on a copy of the image
                    color_range['lower'],
                    color_range['upper'],
                )
                if highlighted_text:
                    page_results.extend(highlighted_text)

            # Save results, flushing so partial output survives a later crash.
            if page_results:
                output_file.write(f"\n第{page_num}页标注文本:\n")
                output_file.write('\n'.join(page_results) + '\n')
                output_file.flush()
                print(f"第 {page_num} 页找到 {len(page_results)} 条文本")
            else:
                print(f"第 {page_num} 页未找到高亮文本")
        except Exception as e:
            print(f"处理第 {page_num} 页时出错: {str(e)}")

    def preprocess_image(self, pil_image):
        """Resize a page image and convert it to an enhanced BGR array."""
        # Shrink oversized images.
        pil_image = self.resize_image(pil_image)
        # Convert to OpenCV's BGR channel order.
        img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        # Blur to reduce noise, then raise contrast and brightness.
        img = cv2.GaussianBlur(img, (3, 3), 0)
        img = cv2.convertScaleAbs(img, alpha=1.2, beta=10)
        return img

    def get_pdf_page_count(self, pdf_path, poppler_path):
        """Return the number of pages in a PDF, or 0 if it cannot be read.

        Only the PDF metadata is queried; no page is rendered.
        """
        try:
            from pdf2image.pdf2image import pdfinfo_from_path

            info = pdfinfo_from_path(pdf_path, poppler_path=poppler_path)
            return info['Pages']
        except Exception as e:
            print(f"获取PDF页数失败: {str(e)}")
            return 0

    def resize_image(self, image, max_dimension=2000):
        """Scale ``image`` down proportionally so no side exceeds the limit.

        Args:
            image: Object with a ``size`` ``(width, height)`` attribute and a
                ``resize((width, height))`` method.
            max_dimension: Largest allowed width or height in pixels.

        Returns:
            ``image`` itself when it already fits, otherwise the resized copy.
        """
        width, height = image.size
        longest_side = max(width, height)
        if longest_side <= max_dimension:
            return image
        # Use one scale factor for both sides to keep the aspect ratio.
        scale = max_dimension / longest_side
        return image.resize((int(width * scale), int(height * scale)))

    def extract_colored_text(self, img, color_lower, color_upper):
        """Run OCR on the parts of ``img`` whose colour lies in an HSV range.

        Args:
            img: BGR image array.
            color_lower: Lower HSV bound passed to ``cv2.inRange``.
            color_upper: Upper HSV bound passed to ``cv2.inRange``.

        Returns:
            List of recognised, stripped, non-empty strings with confidence
            above 0.6. Any failure is printed and yields an empty list.
        """
        try:
            # Convert colour space.
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

            # Build the colour mask.
            mask = cv2.inRange(hsv, color_lower, color_upper)

            # Morphological clean-up: dilate, erode, then an opening.
            kernel = np.ones((3, 3), np.uint8)
            mask = cv2.dilate(mask, kernel, iterations=2)
            mask = cv2.erode(mask, kernel, iterations=1)
            mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

            # Apply the mask.
            result = cv2.bitwise_and(img, img, mask=mask)

            # Grayscale with boosted contrast.
            gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)
            gray = cv2.convertScaleAbs(gray, alpha=1.5, beta=10)

            # OCR.
            ocr_result = self.ocr.ocr(gray, cls=True)

            # The outer result and each line may be None, so check both.
            texts = []
            for line in ocr_result or []:
                if line is None:
                    continue
                for word_info in line:
                    if not (isinstance(word_info, list) and len(word_info) >= 2):
                        continue
                    # word_info[1] holds (text, confidence).
                    text = word_info[1][0].strip()
                    confidence = word_info[1][1]
                    if confidence > 0.6 and text:
                        texts.append(text)
            return texts
        except Exception as e:
            print(f"文本提取错误: {str(e)}")
            return []


def main():
    """Create an extractor and process one PDF, reporting any failure."""
    try:
        extractor = PDFTextExtractor()
        pdf_path = r"E:\z_library_books\平时作业\中国旅游文化_11608595(1).pdf"
        extractor.process_pdf(pdf_path)
    except Exception as e:
        print(f"程序执行失败: {str(e)}")


if __name__ == "__main__":
    main()
补充说明:便于理解程序的运行流程
好的,让我更详细地解释每个步骤:
1. 程序启动和初始化
def main():try:# 1. 创建PDF文本提取器extractor = PDFTextExtractor()# 这一步会:# - 启动PaddleOCR引擎(文字识别工具)# - 设置识别中文# - 设置不使用GPU# - 设置要识别的颜色范围(黄色和红色)
2. 设置文件路径和开始处理
# 2. 设置PDF文件路径pdf_path = r"E:\z_library_books\平时作业\中国旅游文化_11608595(1).pdf"# 3. 开始处理PDFextractor.process_pdf(pdf_path)
3. PDF处理流程(process_pdf函数)
def process_pdf(self, pdf_path, output_path='extracted_text.txt'):try:# 1. 检查PDF文件是否存在if not os.path.exists(pdf_path):raise FileNotFoundError("PDF文件不存在")# 2. 记录开始时间start_time = time.time()# 3. 设置poppler工具路径(用于转换PDF为图片)poppler_path = r"E:\Proper\poppler-24.08.0\Library\bin"# 4. 获取PDF总页数total_pages = self.get_pdf_page_count(pdf_path, poppler_path)print(f"PDF总页数: {total_pages}")# 5. 创建输出文件with open(output_path, 'w', encoding='utf-8') as f:# 6. 逐页处理for page_num in range(1, total_pages + 1):# 处理每一页...
4. 单页处理流程
# 对于每一页:
try:# 1. 将PDF页面转换为图片pages = convert_from_path(pdf_path,first_page=page_num,last_page=page_num,dpi=200, # 设置图片清晰度poppler_path=poppler_path)# 2. 获取页面图片page = pages[0]# 3. 预处理图片img = self.preprocess_image(page)# - 调整图片大小# - 增加清晰度# - 调整亮度和对比度# 4. 处理每种颜色page_results = []for color_name, color_range in self.color_ranges.items():print(f"处理{color_name}色文本...")# 提取特定颜色的文本highlighted_text = self.extract_colored_text(img.copy(),color_range['lower'],color_range['upper'])if highlighted_text:page_results.extend(highlighted_text)# 5. 保存这一页的结果if page_results:f.write(f"\n第{page_num}页标注文本:\n")f.write('\n'.join(page_results) + '\n')
5. 文本提取流程(extract_colored_text函数)
def extract_colored_text(self, img, color_lower, color_upper):try:# 1. 转换颜色空间,便于找到高亮部分hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)# 2. 创建掩码(找出高亮部分)mask = cv2.inRange(hsv, color_lower, color_upper)# 3. 优化掩码kernel = np.ones((3, 3), np.uint8)mask = cv2.dilate(mask, kernel, iterations=2)mask = cv2.erode(mask, kernel, iterations=1)# 4. 提取高亮区域result = cv2.bitwise_and(img, img, mask=mask)# 5. 转为灰度图gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)# 6. OCR识别文字ocr_result = self.ocr.ocr(gray, cls=True)# 7. 处理识别结果texts = []if ocr_result is not None:for line in ocr_result:if line is not None:for word_info in line:text = word_info[1][0].strip()confidence = word_info[1][1]if confidence > 0.6 and text:texts.append(text)return texts
这个程序就像一个阅读助手:
- 先准备好工具(OCR引擎)
- 打开PDF文件
- 一页一页地:
- 把PDF页面转成图片
- 找出高亮的部分
- 识别高亮部分的文字
- 记录下识别到的文字
- 最后把所有记录的文字保存到文件中