本文将重点介绍机器视觉的一个分支:文字识别,
并介绍如何用一些 Python 库来识别和使用在线图片中的文字。
我们可以很轻松地阅读图片里的文字,但机器阅读这些图片却非常困难。验证码(CAPTCHA)正是利用了这种人类用户可以正常读取、而大多数机器人无法读取的图片。
将图像翻译成文字一般被称为光学文字识别(Optical Character Recognition, OCR)
下载 Tesseract 安装包和训练数据:
https://github.com/tesseract-ocr/tesseract/wiki
https://github.com/tesseract-ocr/tesseract/wiki/Data-Files
import urllib.request

import pytesseract
from PIL import Image

url = "https://so.gushiwen.org/RandCode.ashx?"
# Download one captcha image from the site into the working directory.
urllib.request.urlretrieve(url, './captcha.png')


def cleanImage(path, threshold=150, out_path='clean_captcha.png'):
    """Binarize a captcha image to strip background noise.

    The image is converted to 8-bit grayscale and every pixel is mapped to
    pure white (255) when its gray level is above ``threshold`` and to pure
    black (0) otherwise. The result is written to ``out_path``.

    Args:
        path: Location of the image file to clean.
        threshold: Gray level separating background from glyphs. The default
            of 150 is an estimate, not a calibrated value.
        out_path: Where the cleaned image is saved.

    Returns:
        The binarized ``PIL.Image.Image`` in mode ``'L'``.
    """
    # convert() produces an independent copy, so the source file can be
    # closed as soon as the grayscale image exists.
    with Image.open(path) as source:
        gray = source.convert('L')

    # point() evaluates the mapping once per possible gray level instead of
    # once per pixel, replacing the nested Python loops.
    cleaned = gray.point(lambda level: 255 if level > threshold else 0)
    cleaned.save(out_path)
    return cleaned


image = cleanImage('./captcha.png')
image.show()
result = pytesseract.image_to_string(image)
print(result)
使用 selenium 模拟登录古诗文网站,并识别验证码
from selenium import webdriver
import pytesseract
from PIL import Image
import time

from selenium.webdriver.common.by import By

url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'


def get_captcha():
    """Open the login page and save its captcha image to ``./captcha.png``.

    A full-page screenshot is written to ``./poem.png`` and the captcha
    region is cropped out of it.

    Returns:
        The Chrome ``webdriver`` instance, still open on the login page so
        the caller can submit the form. The caller must call ``quit()``.

    Raises:
        Exception: Any error raised while driving the browser or handling
            the image is re-raised after the browser has been closed.
    """
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.implicitly_wait(10)
        driver.get(url)

        # The captcha <img> element.
        img = driver.find_element(By.ID, 'imgCode')
        driver.save_screenshot('./poem.png')

        # Top-left corner of the element.
        loc = img.location
        print(loc)
        # Width and height of the element.
        size = img.size
        print(size)

        # Crop rectangle as (left, upper, right, lower).
        # NOTE(review): the +100/+51 offsets and the fixed 554/290 edges look
        # hand-tuned for one particular screen; they do not use `size`.
        # Confirm them on the target machine.
        rec = (loc['x'] + 100, loc['y'] + 51, 554, 290)
        with Image.open('./poem.png') as image:
            captcha = image.crop(rec)
            captcha.save('./captcha.png')
    except Exception:
        # Do not leave a browser process behind when setup fails.
        driver.quit()
        raise
    return driver


def recognize_captcha():
    """Run OCR on ``./captcha.png`` and return the recognized text.

    The image is converted to grayscale and binarized: gray levels below 140
    become black (0), everything else becomes white (255).

    Returns:
        The text reported by Tesseract with surrounding whitespace removed.
        Recognition is not guaranteed to be correct.
    """
    with Image.open('./captcha.png') as captcha:
        gray = captcha.convert('L')

    # 0 is pure black, 255 is pure white.
    binary = gray.point(lambda level: 0 if level < 140 else 255)
    code = pytesseract.image_to_string(binary)
    # OCR output usually carries trailing whitespace or newlines, which
    # would otherwise be typed into the form field as well.
    return code.strip()


def login(drive, code):
    """Fill in the login form and submit it.

    Args:
        drive: A webdriver already showing the login page.
        code: Captcha text to type into the verification field.
    """
    drive.find_element(By.ID, 'email').send_keys('xxxxxxx@qq.com')
    drive.find_element(By.ID, 'pwd').send_keys('xxxxabc')
    drive.find_element(By.ID, 'code').send_keys(code)
    time.sleep(3)
    drive.find_element(By.ID, 'denglu').click()


if __name__ == '__main__':
    drive = get_captcha()
    try:
        # The recognized captcha may be wrong.
        code = recognize_captcha()
        login(drive, code)
    finally:
        # Always close the browser, even if OCR or login fails.
        drive.quit()