# encoding: utf-8
# 版权所有 2023 涂聚文有限公司
# 许可信息查看:
# 描述:
# Author : geovindu,Geovin Du 涂聚文.
# IDE : PyCharm 2023.1 python 311
# Datetime : 2023/9/30 6:56
# User : geovindu
# Product : PyCharm
# Project : pythonTkinterDemo
# File : BaiduOCRAPI.py
# explain : 学习
import os
import base64
import requests
import pandas as pd
import json


class BaiduOCR(object):
    """Read VAT-invoice fields from PDF or image files through the Baidu OCR API.

    Credentials default to the class-level values below and can be overridden
    per instance through the constructor.
    """

    # NOTE(review): these look like placeholder credentials — confirm and
    # supply real values (ideally from configuration, not source code).
    AppID = "40226401"
    APIKey = "geovindu"
    SecretKey = "geovindu"

    TokenUrl = "https://aip.baidubce.com/oauth/2.0/token"
    VatInvoiceUrl = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
    # Seconds to wait for each HTTP call; without a timeout a stalled
    # connection would block the caller indefinitely.
    RequestTimeout = 30

    def __init__(self, app_id=None, api_key=None, secret_key=None):
        """Create a client.

        :param app_id: application id; defaults to the class-level ``AppID``
        :param api_key: API key; defaults to the class-level ``APIKey``
        :param secret_key: secret key; defaults to the class-level ``SecretKey``
        """
        cls = type(self)
        self.AppID = cls.AppID if app_id is None else app_id
        self.APIKey = cls.APIKey if api_key is None else api_key
        self.SecretKey = cls.SecretKey if secret_key is None else secret_key

    def getAccessToken(self):
        """Request an OAuth access token with the client-credentials grant.

        :return: the access token as a string
        :raises RuntimeError: if the response contains no ``access_token``
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        response = requests.post(
            self.TokenUrl, params=params, timeout=self.RequestTimeout
        )
        payload = response.json()
        token = payload.get("access_token")
        if not token:
            # Previously a missing token was silently returned as the string
            # "None", which only failed later with an unrelated error.
            raise RuntimeError(f"Baidu token request failed: {payload!r}")
        return str(token)

    def _recognize(self, accessToken, filePath, fieldName):
        """POST one base64-encoded file to the VAT-invoice endpoint.

        :param accessToken: token obtained from :meth:`getAccessToken`
        :param filePath: path of the file to upload
        :param fieldName: form field carrying the payload
            (``"pdf_file"`` or ``"image"``)
        :return: the decoded JSON response
        """
        # The context manager guarantees the handle is closed even if
        # reading fails.
        with open(filePath, 'rb') as handle:
            encoded = base64.b64encode(handle.read())
        response = requests.post(
            self.VatInvoiceUrl,
            params={"access_token": accessToken},
            data={fieldName: encoded},
            headers={'content-type': 'application/x-www-form-urlencoded'},
            timeout=self.RequestTimeout,
        )
        return response.json()

    def getContent(self, accessToken, pdfFile):
        """Run VAT-invoice OCR on a PDF file.

        :param accessToken: token obtained from :meth:`getAccessToken`
        :param pdfFile: path of the PDF file
        :return: the decoded JSON response
        """
        return self._recognize(accessToken, pdfFile, "pdf_file")

    def getContentPng(self, accessToken, pngFile):
        """Run VAT-invoice OCR on an image file.

        :param accessToken: token obtained from :meth:`getAccessToken`
        :param pngFile: path of the image file
        :return: the decoded JSON response
        """
        return self._recognize(accessToken, pngFile, "image")

    @staticmethod
    def _firstWord(entries):
        """Return the ``word`` of the first entry, or ``''`` for an empty list."""
        return entries[0]['word'] if entries else ''

    def getUsefulInfo(self, content, pdf_name):
        """Flatten an OCR response into a single record of invoice fields.

        :param content: decoded JSON response containing ``words_result``
        :param pdf_name: label stored in the record to identify the source file
        :return: dict mapping column titles to the extracted values
        :raises KeyError: if ``words_result`` or an expected field is missing
        """
        try:
            words_result = content['words_result']
        except KeyError:
            # Keep the exception type but show the payload, which is usually
            # an API error message explaining why recognition failed.
            raise KeyError(
                f"OCR response has no 'words_result': {content!r}"
            ) from None

        first = self._firstWord
        return {
            '发票文件名': pdf_name,
            '发票号码': str(words_result['InvoiceNum']),
            '开票日期': words_result['InvoiceDate'],
            # Commodity fields are lists of rows; only the first row is kept.
            '货物名称': first(words_result['CommodityName']),
            '未税金额': first(words_result['CommodityAmount']),
            '货物税率': first(words_result['CommodityTaxRate']),
            '货物税额': first(words_result['CommodityTax']),
            '合计金额': words_result['TotalAmount'],
            '合计税额': words_result['TotalTax'],
            '价税合计(小写)': words_result['AmountInFiguers'],
            '价税合计(大写)': words_result['AmountInWords'],
            '销售方名称': words_result['SellerName'],
            '销售方纳税人识别号': words_result['SellerRegisterNum'],
            '销售方银行及账户': words_result['SellerBank'],
            '销售方地址及电话': words_result['SellerAddress'],
        }
# 调用:用京东多张发票测试成功 (Usage: tested successfully with multiple JD.com invoices)
def main(invoice_dir="invoice/", output_path="geovindu.xlsx"):
    """OCR every PDF invoice in a directory and export the fields to Excel.

    :param invoice_dir: directory scanned (non-recursively) for ``.pdf`` files
    :param output_path: path of the Excel workbook to write
    """
    # Local import so this block does not depend on a module-level
    # ``import os`` being present.
    import os

    # The class is defined in this module, so it is used directly instead of
    # through the never-imported ``Common.BaiduOCRAPI`` package path.
    ocr = BaiduOCR()

    # Sorted so the rows come out in a stable order regardless of the
    # directory listing order.
    pdf_files = [
        name for name in sorted(os.listdir(invoice_dir))
        if os.path.splitext(name)[1].lower() == '.pdf'
    ]

    # One token serves the whole batch; request it only when there is work.
    access_token = ocr.getAccessToken() if pdf_files else None

    infolist = []
    for pdfFile in pdf_files:
        # File stem as a string (the old ``split(".")[:-1]`` produced a list).
        pdfName = os.path.splitext(pdfFile)[0]
        print(pdfFile)
        content = ocr.getContent(access_token, os.path.join(invoice_dir, pdfFile))
        infolist.append(ocr.getUsefulInfo(content, pdfName))

    df = pd.DataFrame(infolist)
    print(df)
    # NOTE(review): ExcelWriter is presumably opened in its default write
    # mode here; pass mode='a' to append to an existing workbook — confirm.
    with pd.ExcelWriter(output_path) as writer:
        df.to_excel(writer, sheet_name='geovindu', index=False)


if __name__ == "__main__":
    main()