# 1. Convert a PDF into images (pdf转图片)
import os
import numpy as np
import cv2
from PIL import Image
Image.MAX_IMAGE_PIXELS=None
import tempfile
import time
import sys
from pdf2image import convert_from_bytes# # 预处理程序
# sys.path.append("./data_prepare")
# from data_prepare.batch_pdf2jpg import pdf2jpg, pdf2pil_imgs
# from data_prepare.batch_rectify import batch_rectify2, rectify_single_img_fast
# sys.path.append("./tools")
# from tools.img_tools import cv_resize_long_edge, pil_resize_long_edge


def cv_resize_long_edge(cv_img, long_edge_length):
    """Scale an OpenCV image so that its longer edge equals ``long_edge_length``.

    Args:
        cv_img: Image array whose first two dimensions are (height, width).
        long_edge_length: Target length, in pixels, of the longer edge.

    Returns:
        ``cv_img`` itself when the longer edge already has the requested
        length, otherwise a new image resized with bilinear interpolation
        (aspect ratio preserved).
    """
    height, width = cv_img.shape[:2]
    max_len = max(width, height)
    if max_len == long_edge_length:
        return cv_img
    ratio = long_edge_length / max_len
    return cv2.resize(cv_img, None, fx=ratio, fy=ratio, interpolation=cv2.INTER_LINEAR)


def pil_resize_long_edge(pil_img, long_edge_length):
    """Scale a PIL image so that its longer edge equals ``long_edge_length``.

    Args:
        pil_img: Image exposing ``size`` as (width, height) and ``resize``.
        long_edge_length: Target length, in pixels, of the longer edge.

    Returns:
        ``pil_img`` itself when the longer edge already has the requested
        length, otherwise a resized copy (aspect ratio preserved).
    """
    width, height = pil_img.size
    max_len = max(width, height)
    if max_len == long_edge_length:
        return pil_img
    ratio = max_len / long_edge_length
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # under its current name.
    return pil_img.resize((round(width / ratio), round(height / ratio)), Image.LANCZOS)


class PDF:
    """A PDF rendered to one OpenCV (BGR) image per page.

    After construction ``self.pages`` is a list of dicts, one per page, with:
        "ocv":      the page as a BGR ``numpy`` array;
        "img_path": path of the saved JPEG (only when ``save_img_dir`` is set).

    The object supports ``len(pdf)``, ``pdf[i]`` and iteration over pages.
    """

    def __init__(self, pdf_bytes, model, dpi=300, n_threads=4, save_img_dir="./tmp_ocr_dir", small_size=1280):
        """Render ``pdf_bytes`` and optionally rectify and save every page.

        Args:
            pdf_bytes: Raw content of the PDF file.
            model: Orientation model handed to ``rectify_single_img_fast``;
                ``None`` disables rectification.
            dpi: Rendering resolution passed to ``convert_from_bytes``.
            n_threads: Thread count passed to ``convert_from_bytes``.
            save_img_dir: Directory where each page is written as
                ``<index>.jpg``; ``None`` disables saving.
            small_size: Unused at present; kept so existing callers that pass
                it keep working.
        """
        rectify = None
        if model is not None:
            # The module-level import of this helper is commented out in this
            # file, so resolve it here -- and do it before the expensive
            # rendering so a missing dependency fails fast.
            from data_prepare.batch_rectify import rectify_single_img_fast as rectify

        st = time.time()
        self.pages = []
        # With output_folder set, the returned PIL images are backed by files
        # in the temporary directory, so every page is decoded into memory
        # before the directory is removed.
        with tempfile.TemporaryDirectory() as tmp_out:
            pil_imgs = convert_from_bytes(pdf_bytes, output_folder=tmp_out, dpi=dpi, fmt="jpg", thread_count=n_threads)
            print("pdf拆分用时: 共 %d 页用时: %.3fs" % (len(pil_imgs), time.time() - st))

            start = time.time()
            if save_img_dir is not None:
                os.makedirs(save_img_dir, exist_ok=True)

            max_size = 5000  # upper bound for the longer edge of a page
            for i, pil_img in enumerate(pil_imgs):
                page = {}
                # PIL yields RGB; OpenCV expects BGR. Forcing RGB first keeps
                # grayscale/CMYK pages from breaking the channel swap.
                cv_img = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
                if max(cv_img.shape[:2]) > max_size:
                    cv_img = cv_resize_long_edge(cv_img, max_size)

                if rectify is not None:
                    # Correct the page orientation with the classifier model.
                    cv_img = rectify(model, cv_img)
                page["ocv"] = cv_img
                self.pages.append(page)

                if save_img_dir is not None:
                    img_path = os.path.join(save_img_dir, "%d.jpg" % i)
                    page["img_path"] = img_path
                    cv2.imwrite(img_path, page["ocv"])
        print("pdf初始化及矫正: 共 %d 页用时: %.3fs" % (len(self.pages), time.time() - start))

    def __getitem__(self, i):
        """Return the page dict at index ``i``."""
        return self.pages[i]

    def __len__(self):
        """Return the number of pages."""
        return len(self.pages)


def _pdfs_to_images_main():
    """Render every PDF in ``./me_pdf`` to ``./test_img/<pdf name>_<page>.jpg``."""
    tmp_dir = "./test_img"
    os.makedirs(tmp_dir, exist_ok=True)
    path = './me_pdf'
    for file_name in os.listdir(path):
        pdf_file_path = os.path.join(path, file_name)
        with open(pdf_file_path, 'rb') as pdf_file:
            pdf_bin = pdf_file.read()
        pdf = PDF(pdf_bin, model=None, save_img_dir=None, small_size=2000)
        # Strip only the final extension so "a.b.pdf" and "a.c.pdf" do not
        # both collapse to "a" and overwrite each other's pages.
        name = os.path.splitext(os.path.basename(pdf_file_path))[0]
        for page_idx, page in enumerate(pdf):
            cv2.imwrite(os.path.join(tmp_dir, name + '_' + str(page_idx) + '.jpg'), page['ocv'])


if __name__ == '__main__':
    _pdfs_to_images_main()
# 2. Convert images into a PDF (图片转pdf)
from reportlab.lib.pagesizes import A4, portrait, landscape
from reportlab.pdfgen import canvas
import os
import cv2
# imgs_path = './需要转换成pdf图片'
# imgs_list_path = [os.path.join(imgs_path,i) for i in os.listdir(imgs_path)]
# imgs_list_path = sorted(imgs_list_path)
# for i, img_list_path in enumerate(imgs_list_path):
# if i<1:
# print('img_list_path:', img_list_path)


def _natural_sort_key(path):
    """Return a sort key that orders the digit runs in a file name numerically."""
    import re

    # Splitting on a capturing group always alternates text, digits, text, ...
    # so odd positions are digit runs and keys stay comparable element-wise.
    tokens = re.split(r'(\d+)', os.path.basename(path))
    key = [int(tok) if idx % 2 else tok.lower() for idx, tok in enumerate(tokens)]
    # The full path breaks ties between names that differ only in letter case.
    return key, path


def convert_images_to_pdf(imgs_path, pdf_path):
    """Write every image file in ``imgs_path`` to ``pdf_path``, one per page.

    Images are placed in natural file-name order ("2.jpg" before "10.jpg")
    and each one is stretched to fill a portrait A4 page. Sub-directories
    inside ``imgs_path`` are skipped.

    Args:
        imgs_path: Directory containing the images.
        pdf_path: Path of the PDF file to create.
    """
    page_size = portrait(A4)
    w, h = page_size
    c = canvas.Canvas(pdf_path, pagesize=page_size)

    candidates = (os.path.join(imgs_path, entry) for entry in os.listdir(imgs_path))
    img_paths = sorted((p for p in candidates if os.path.isfile(p)), key=_natural_sort_key)
    for img_path in img_paths:
        c.drawImage(img_path, 0, 0, w, h)
        c.showPage()
    c.save()


def _images_to_pdfs_main():
    """Build ``1.pdf``, ``2.pdf``, ... from the sub-directories of the input folder."""
    path = './红头文件/身份证pdf'
    entries = (os.path.join(path, entry) for entry in os.listdir(path))
    # Sort so that output numbering is reproducible between runs; os.listdir
    # returns entries in arbitrary order.
    dirs_list_path = sorted((d for d in entries if os.path.isdir(d)), key=_natural_sort_key)
    for i, dir_list_path in enumerate(dirs_list_path):
        pdf_path = str(i + 1) + '.pdf'
        convert_images_to_pdf(dir_list_path, pdf_path)


if __name__ == '__main__':
    _images_to_pdfs_main()