一、思路分析
首先,拿到一张文档图像后,需要先对其进行预处理,再进行轮廓检测。即使检测到了文档轮廓,这些轮廓也可能是歪斜的,因此还需要通过透视变换将文档摆正。最后调用OCR函数库实现文档内容的识别。
二、导包及其相关函数
# 导入工具包
import numpy as np
import argparse
import cv2
# Command-line interface: a single required option naming the photo to scan.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-i",
    "--image",
    required=True,
    help="Path to the image to be scanned",
)
# Expose the parsed options as a plain dict so later code can read args["image"].
args = vars(parser.parse_args())
def order_points(pts):
    """Return four corner points in a fixed order.

    Args:
        pts: Four (x, y) points; expected to be a (4, 2) array, since the
            code reduces along axis 1.

    Returns:
        A (4, 2) float32 array ordered top-left, top-right, bottom-right,
        bottom-left.
    """
    rect = np.zeros((4, 2), dtype="float32")

    # x + y is smallest at the top-left corner and largest at the bottom-right.
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]

    # y - x is smallest at the top-right corner and largest at the bottom-left.
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect


def four_point_transform(image, pts):
    """Warp the quadrilateral *pts* of *image* into an upright rectangle.

    Args:
        image: Source image handed to cv2.warpPerspective.
        pts: Four (x, y) corner points of the region, in any order.

    Returns:
        The warped image. Its width and height are the longer of the two
        opposite edges of the quadrilateral, truncated to integers.
    """
    rect = order_points(pts)
    (tl, tr, br, bl) = rect
    # Corner layout:
    #   tl  tr
    #   bl  br

    # The detected contour is not guaranteed to be a true rectangle, so both
    # horizontal edges are measured and the longer one becomes the width.
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    maxWidth = max(int(widthA), int(widthB))

    # Same for the two vertical edges and the output height.
    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    maxHeight = max(int(heightA), int(heightB))

    # Destination corners: an axis-aligned rectangle with its top-left corner
    # at (0, 0), listed in the same order as rect.
    dst = np.array(
        [
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1],
        ],
        dtype="float32",
    )

    # rect holds the four source corners and dst the four straightened ones;
    # M is the 3x3 perspective-transform matrix mapping the former to the latter.
    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
    return warped


def resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize *image* to a target width or height, keeping the aspect ratio.

    Args:
        image: Input image; only its first two shape entries (rows, columns)
            are read.
        width: Target width in pixels. When given, it takes precedence and
            *height* is ignored.
        height: Target height in pixels, used only when *width* is None.
        inter: Interpolation flag forwarded to cv2.resize.

    Returns:
        The resized image, or *image* itself when neither dimension is given.
    """
    (h, w) = image.shape[:2]
    if width is None and height is None:
        return image
    if width is None:
        # Scale by the height ratio and derive the matching width.
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        # Scale by the width ratio and derive the matching height.
        r = width / float(w)
        dim = (width, int(h * r))
    resized = cv2.resize(image, dim, interpolation=inter)
    return resized
三、提取出文档的轮廓
# Read the input image.
image = cv2.imread(args["image"])
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so stop here with a clear message.
    raise SystemExit("Could not read image: {}".format(args["image"]))

# Detection runs on a copy resized to a height of 500 px. Keep the scale
# factor (shape[0] is the image height, i.e. the number of rows) so that the
# detected corner coordinates can be mapped back onto the original image.
ratio = image.shape[0] / 500.0
orig = image.copy()
image = resize(orig, height=500)

# Pre-processing
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # grayscale
gray = cv2.GaussianBlur(gray, (5, 5), 0)  # Gaussian blur to suppress noise
edged = cv2.Canny(gray, 75, 200)  # Canny edge detection

# Show the pre-processing result
print("STEP 1: Canny边缘检测")
cv2.imshow("Image", image)
cv2.imshow("Edged", edged)
cv2.waitKey(0)
cv2.destroyAllWindows()
# Contour detection, run on the Canny edge map.
# cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x but
# (contours, hierarchy) in OpenCV 2.4 and 4.x, so select the contour list by
# the length of the returned tuple rather than by a fixed index.
found = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
cnts = found[0] if len(found) == 2 else found[1]
# Keep only the five contours with the largest area.
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]

# Walk the candidates from largest to smallest and take the first one whose
# polygon approximation has exactly four vertices.
screenCnt = None
for c in cnts:
    # Perimeter of the (closed) contour.
    peri = cv2.arcLength(c, True)
    # c is the input point set; epsilon (2% of the perimeter) is the maximum
    # distance allowed between the contour and its approximation; True marks
    # the curve as closed.
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)
    # Four vertices: treat it as the document outline.
    if len(approx) == 4:
        screenCnt = approx
        break

if screenCnt is None:
    # Without this guard the code below would fail with a NameError.
    raise SystemExit("No four-point contour found; cannot locate the document")

# Show the result
print("STEP 2: 获取轮廓")
cv2.drawContours(image, [screenCnt], -1, (0, 255, 0), 2)
cv2.imshow("Outline", image)
cv2.waitKey(0)
cv2.destroyAllWindows()
三、通过透视变换将文档图像摆正
# Perspective transform.
# The contour was found on the resized image, so its four corners are scaled
# by `ratio` to express them in the coordinates of the original image
# (`orig` is the untouched copy of the input).
corners = screenCnt.reshape(4, 2) * ratio
warped = four_point_transform(orig, corners)

# Binarisation: convert the straightened page to grayscale, then apply a
# fixed threshold of 100.
warped = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
_, ref = cv2.threshold(warped, 100, 255, cv2.THRESH_BINARY)
# Save the scanned page; it is the input sample for the OCR step.
cv2.imwrite('scan.jpg', ref)

# Show the result
print("STEP 3: 透视变换")
original_preview = resize(orig, height=650)
cv2.imshow("Original", original_preview)
scanned_preview = resize(ref, height=650)
cv2.imshow("Scanned", scanned_preview)
cv2.waitKey(0)
此时就会将扫描得到的文档进行保存
后续的OCR识别就是针对这个文档进行的。
四、Pycharm参数设定
对于 文档进行透视变换摆正 需要提供参数,指定图像路径
找到Edit Configurations
将image参数改成自己测试图像路径
--image yy.png
,只需要重新指定后面的yy.png图像路径即可
五、OCR对文档内容进行识别
Ⅰ,工具包下载安装
tesseract工具包下载,windows就直接找个版本下载exe文件就行,直接下一步就行,需要注意一下安装的路径。
Ⅱ,配置环境变量
cmd下进行测试看看安装是否成功,tesseract -v
输出版本信息即可
Ⅲ,测试一下
例如在D盘下有一张图像
tesseract beyondyanyu.jpg yy
beyondyanyu.jpg为测试图像路径,yy为生成结果的路径及名称
注意你存放测试图像的路径
报错解决方法一:
若在此处报错,需要新建一个系统环境变量,变量名和变量值如下:
TESSDATA_PREFIX
C:\Program Files (x86)\Tesseract-OCR\tessdata
电脑关机重启一下。
再进行上述测试,看看有没有生成结果。
报错解决方法二:
G:\Anaconda3\Lib\site-packages\pytesseract
打开该目录下的pytesseract.py,把其中tesseract_cmd的值改成tesseract安装位置的绝对路径即可。
Ⅳ,安装pytesseract
我用的是Anaconda,在Anaconda prompt中输入:pip install pytesseract
Ⅴ,测试
import os

import cv2
import pytesseract
from PIL import Image

# Pre-processing applied before OCR: 'thresh' (Otsu binarisation) or
# 'blur' (median blur). Either one is worth trying.
preprocess = 'thresh'

# Load the straightened page written by the perspective-transform step.
image = cv2.imread('scan.jpg')
if image is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise SystemExit("Could not read image: scan.jpg")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

if preprocess == "thresh":
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif preprocess == "blur":
    gray = cv2.medianBlur(gray, 3)

# The pre-processed image is written to a temporary PNG named after the
# process id and read back through PIL for pytesseract.
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
try:
    # Close the PIL handle before deleting the file.
    with Image.open(filename) as img:
        text = pytesseract.image_to_string(img)
finally:
    # Remove the temporary file even if OCR fails.
    os.remove(filename)
print(text)

cv2.imshow("Image", image)
cv2.imshow("Output", gray)
cv2.waitKey(0)
控制台输出:
"""
Pasuze InoQuestions 12 to 15 are based on the passage you have just heard.2 Ai Moveable me1 type began to be used in privung
B) Chmese priwing technology was first introduced
C) The earliest known baok was publishedD) Metal type was imported from Korea13. A) It had more than a hundred printing pressesB) It was the bignest printer in the 16th centuryC) It helped the Geman people become lnerate
D; It produced some 20 milion volumes in total14. Aj It pushed hindwntten books out of circulation
B) It boosted the cnculation of popular works
C) It made writing a very profitable career
1) It provided readers with more choicesAj It acceterated the extinction of the Latin languageB) It sndardized the publication of grammur books
C) It turned translation into a welcome professionD) it promoted the growth of national linguagesThe earliest printed book we know today speared in China in the year 868, and metal type was
in use in Kore at the begining of the fifteenth century, but t was in Germany around the year 1450that a printing press using. movable metal type was invented. Capitalism turned printing from anmncenton into an industry. Richt from the start, book printing and publishing were organized oncapitalist fines. The biggest sinteenth-century printer, Plantin of Antwerp, had. twentfour pruing
prewes and employed more than a hundred workers. Only a small fraction of the population was
but the production of books grew at an extraordinary speed. By 1500 some twenty million
volumes hed atready been printedTheeffect of printing was to increase the circulation of works that were already
popular in a handwritten form, while less popular works went out of circulation. Publishers: were
only in books that would sell faily queldy in sufficient numbers to caver the costs of
production and make a profit. Thus, while priting enormously increased access to books by making
cheap. high- salume production possible, it also reduced choiceThe wieat cultural impact of printing was thi it ficifitared thewth of national languages
Most carly books were printed in Latin, but the market for Latin was limited. and in its pursuit of$Process finished with exit code 0
"""
六、完整代码
对文档进行透视变换摆正
# 导入工具包
import numpy as np
import argparse
import cv2# 设置参数
# Command-line interface: a single required option naming the photo to scan.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--image", required=True,
                    help="Path to the image to be scanned")
# Expose the parsed options as a plain dict so later code can read args["image"].
args = vars(parser.parse_args())


def order_points(pts):
    """Return four corner points in a fixed order.

    Args:
        pts: Four (x, y) points; expected to be a (4, 2) array, since the
            code reduces along axis 1.

    Returns:
        A (4, 2) float32 array ordered top-left, top-right, bottom-right,
        bottom-left.
    """
    rect = np.zeros((4, 2), dtype="float32")

    # x + y is smallest at the top-left corner and largest at the bottom-right.
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]

    # y - x is smallest at the top-right corner and largest at the bottom-left.
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect


def four_point_transform(image, pts):
    """Warp the quadrilateral *pts* of *image* into an upright rectangle.

    Args:
        image: Source image handed to cv2.warpPerspective.
        pts: Four (x, y) corner points of the region, in any order.

    Returns:
        The warped image. Its width and height are the longer of the two
        opposite edges of the quadrilateral, truncated to integers.
    """
    rect = order_points(pts)
    (tl, tr, br, bl) = rect
    # Corner layout:
    #   tl  tr
    #   bl  br

    # The detected contour is not guaranteed to be a true rectangle, so both
    # horizontal edges are measured and the longer one becomes the width.
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    maxWidth = max(int(widthA), int(widthB))

    # Same for the two vertical edges and the output height.
    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    maxHeight = max(int(heightA), int(heightB))

    # Destination corners: an axis-aligned rectangle with its top-left corner
    # at (0, 0), listed in the same order as rect.
    dst = np.array(
        [
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1],
        ],
        dtype="float32",
    )

    # rect holds the four source corners and dst the four straightened ones;
    # M is the 3x3 perspective-transform matrix mapping the former to the latter.
    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
    return warped


def resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize *image* to a target width or height, keeping the aspect ratio.

    Args:
        image: Input image; only its first two shape entries (rows, columns)
            are read.
        width: Target width in pixels. When given, it takes precedence and
            *height* is ignored.
        height: Target height in pixels, used only when *width* is None.
        inter: Interpolation flag forwarded to cv2.resize.

    Returns:
        The resized image, or *image* itself when neither dimension is given.
    """
    (h, w) = image.shape[:2]
    if width is None and height is None:
        return image
    if width is None:
        # Scale by the height ratio and derive the matching width.
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        # Scale by the width ratio and derive the matching height.
        r = width / float(w)
        dim = (width, int(h * r))
    resized = cv2.resize(image, dim, interpolation=inter)
    return resized


# Read the input image
image = cv2.imread(args["image"])
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so stop here with a clear message.
    raise SystemExit("Could not read image: {}".format(args["image"]))

# Detection runs on a copy resized to a height of 500 px. Keep the scale
# factor (shape[0] is the image height, i.e. the number of rows) so that the
# detected corner coordinates can be mapped back onto the original image.
ratio = image.shape[0] / 500.0
orig = image.copy()
image = resize(orig, height=500)

# Pre-processing
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # grayscale
gray = cv2.GaussianBlur(gray, (5, 5), 0)  # Gaussian blur to suppress noise
edged = cv2.Canny(gray, 75, 200)  # Canny edge detection

# Show the pre-processing result
print("STEP 1: Canny边缘检测")
cv2.imshow("Image", image)
cv2.imshow("Edged", edged)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Contour detection
# cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x but
# (contours, hierarchy) in OpenCV 2.4 and 4.x, so select the contour list by
# the length of the returned tuple rather than by a fixed index.
# The contours are extracted from the Canny edge map.
found = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
cnts = found[0] if len(found) == 2 else found[1]
# Keep only the five contours with the largest area.
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]

# Walk the candidates from largest to smallest and take the first one whose
# polygon approximation has exactly four vertices.
screenCnt = None
for c in cnts:
    # Perimeter of the (closed) contour.
    peri = cv2.arcLength(c, True)
    # c is the input point set; epsilon (2% of the perimeter) is the maximum
    # distance allowed between the contour and its approximation; True marks
    # the curve as closed.
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)
    # Four vertices: treat it as the document outline.
    if len(approx) == 4:
        screenCnt = approx
        break

if screenCnt is None:
    # Without this guard the code below would fail with a NameError.
    raise SystemExit("No four-point contour found; cannot locate the document")

# Show the result
print("STEP 2: 获取轮廓")
cv2.drawContours(image, [screenCnt], -1, (0, 255, 0), 2)
cv2.imshow("Outline", image)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Perspective transform
# The contour was found on the resized image, so its four corners are scaled
# by `ratio` to express them in the coordinates of the original image
# (`orig` is the untouched copy of the input).
corners = screenCnt.reshape(4, 2) * ratio
warped = four_point_transform(orig, corners)

# Binarisation: convert the straightened page to grayscale, then apply a
# fixed threshold of 100.
warped = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
_, ref = cv2.threshold(warped, 100, 255, cv2.THRESH_BINARY)
# Save the scanned page.
cv2.imwrite('scan.jpg', ref)

# Show the result
print("STEP 3: 透视变换")
original_preview = resize(orig, height=650)
cv2.imshow("Original", original_preview)
scanned_preview = resize(ref, height=650)
cv2.imshow("Scanned", scanned_preview)
cv2.waitKey(0)
对摆正之后的图像进行OCR识别
# https://digi.bib.uni-mannheim.de/tesseract/
# 配置环境变量如E:\Program Files (x86)\Tesseract-OCR
# tesseract -v进行测试
# tesseract XXX.png 得到结果
# pip install pytesseract
# anaconda lib site-packages pytesseract pytesseract.py
# tesseract_cmd 修改为绝对路径即可
import os

import cv2
import pytesseract
from PIL import Image

# Pre-processing applied before OCR: 'thresh' (Otsu binarisation) or
# 'blur' (median blur). Either one is worth trying.
preprocess = 'thresh'

# Load the straightened page written by the perspective-transform step.
image = cv2.imread('scan.jpg')
if image is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise SystemExit("Could not read image: scan.jpg")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

if preprocess == "thresh":
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif preprocess == "blur":
    gray = cv2.medianBlur(gray, 3)

# The pre-processed image is written to a temporary PNG named after the
# process id and read back through PIL for pytesseract.
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
try:
    # Close the PIL handle before deleting the file.
    with Image.open(filename) as img:
        text = pytesseract.image_to_string(img)
finally:
    # Remove the temporary file even if OCR fails.
    os.remove(filename)
print(text)

cv2.imshow("Image", image)
cv2.imshow("Output", gray)
cv2.waitKey(0)