利用scipy包计算表格线的峰值,还原表格得到表格结构

1. 利用scipy包计算表格线的峰值

import cv2
import numpy as np
from scipy.signal import find_peaks, peak_widths


def get_lines_from_image(img_bin, axis, kernel_len_div=20, kernel_len=None, iters=3):
    """Keep only the long vertical or horizontal strokes of an image.

    The image is eroded and then dilated with a one-pixel-thick rectangular
    kernel, so only bright runs at least as long as the kernel survive.

    :param img_bin: OpenCV image
    :param axis: 0 selects vertical lines, anything else horizontal lines
    :param kernel_len_div: kernel length as a fraction of the image side
        (``shape[axis] // kernel_len_div``, at least 1)
    :param kernel_len: explicit kernel length; when given it must be > 0 and
        ``kernel_len_div`` is ignored
    :return: image containing only the detected lines
    """
    DEBUG = True  # dump the intermediate line image into the working directory

    if kernel_len is not None:
        assert kernel_len > 0
        kernel_length = kernel_len
    else:
        kernel_length = max(np.array(img_bin).shape[axis] // kernel_len_div, 1)

    # getStructuringElement takes (width, height): a 1-wide kernel keeps
    # vertical strokes, a 1-high kernel keeps horizontal strokes.
    if axis == 0:
        kernel_size, debug_name = (1, kernel_length), "verticle_lines.jpg"
    else:
        kernel_size, debug_name = (kernel_length, 1), "horizontal_lines.jpg"

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    eroded = cv2.erode(img_bin, kernel, iterations=iters)
    lines_img = cv2.dilate(eroded, kernel, iterations=iters)
    if DEBUG:
        cv2.imwrite(debug_name, lines_img)
    return lines_img


def line_img_add(verticle_lines_img, horizontal_lines_img):
    """Blend the vertical-line and horizontal-line images with equal weights."""
    alpha = 0.5
    beta = 1.0 - alpha
    return cv2.addWeighted(verticle_lines_img, alpha, horizontal_lines_img, beta, 0.0)


def project(np_arr, axis):
    """Count the zero-valued (black) pixels along ``axis``.

    ``axis=0`` yields one count per column, ``axis=1`` one count per row.
    """
    return np.count_nonzero(np_arr == 0, axis=axis)


def get_grid_coordinate(img_bin, prominence_ratio=0.3, height_ratio=None, distance=None, DEBUG=0):
    """Locate the table grid lines and estimate the line width.

    :param img_bin: 2-D image with black (0) lines on a white background
    :param prominence_ratio: minimum peak prominence, relative to the image
        height (for column peaks) or width (for row peaks)
    :param height_ratio: optional minimum peak height, scaled the same way as
        in the code below (``height_ratio * h`` for x, ``height_ratio * w`` for y)
    :param distance: optional minimum distance between peaks, passed to
        ``scipy.signal.find_peaks``
    :param DEBUG: when truthy, plot both projections and the detected peaks
        with matplotlib (this blocks on ``plt.show()``)
    :return: ``(x_peaks, y_peaks, line_width)`` where the peaks are lists of
        plain ``int`` coordinates
    """
    h, w = img_bin.shape
    x_prj = project(img_bin, 0)  # black pixels per column
    y_prj = project(img_bin, 1)  # black pixels per row

    height_x = height_y = None
    if height_ratio is not None:
        height_x = height_ratio * h
        height_y = height_ratio * w
    print('height_x,height_y:', height_x, height_y)

    x_peaks, _ = find_peaks(x_prj, height=height_x, distance=distance,
                            prominence=(h * prominence_ratio, None))
    y_peaks, _ = find_peaks(y_prj, height=height_y, distance=distance,
                            prominence=(w * prominence_ratio, None))
    # tolist() yields native ints rather than numpy scalars.
    x_peaks = x_peaks.tolist()
    y_peaks = y_peaks.tolist()

    # The DEBUG argument is honoured here; it used to be overwritten with
    # True, which forced a blocking matplotlib window on every call.
    if DEBUG:
        import matplotlib.pyplot as plt
        plt.subplot(211)
        plt.title("x")
        print('range(x_prj.shape[0]):', range(x_prj.shape[0]))
        plt.plot(range(x_prj.shape[0]), x_prj)
        plt.plot(x_peaks, x_prj[x_peaks], "x")
        plt.subplot(212)
        plt.title("y")
        plt.plot(range(y_prj.shape[0]), y_prj)
        plt.plot(y_peaks, y_prj[y_peaks], "x")
        plt.show()

    # No peak found: fall back to the image borders as grid lines.
    if len(x_peaks) == 0:
        x_peaks = [0, w]
        print("x_peaks is None !!!!!!!")
    if len(y_peaks) == 0:
        y_peaks = [0, h]
        print("y_peaks is None !!!!!!!")

    # Line-width estimate, assuming every line has the same width x.
    # With m horizontal and n vertical lines in an h-by-w table:
    #   black_pixels = m*w*x + n*h*x - m*n*x^2  ~=  (m*w + n*h) * x
    m, n = len(y_peaks), len(x_peaks)
    line_width = np.count_nonzero(img_bin == 0) / (m * w + n * h)
    line_width = round(line_width) + 1
    return x_peaks, y_peaks, line_width


if __name__ == '__main__':
    path = './test_page_debug_out_debug/table_crop_fix_rm_char.jpg'
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('cannot read image: ' + path)
    img_bin = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    verticle_lines_img = get_lines_from_image(img_bin, 0, kernel_len_div=40)
    horizontal_lines_img = get_lines_from_image(img_bin, 1, kernel_len_div=40)

    # Combine both line images into one table-line image.
    img_final_bin_lines = line_img_add(verticle_lines_img, horizontal_lines_img)
    cv2.imwrite('./img_final_bin_lines.jpg', img_final_bin_lines)

    # Eroding the inverted image grows its dark regions; then binarise.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    img_final_bin_lines = cv2.erode(~img_final_bin_lines, kernel, iterations=2)
    (thresh, img_final_bin_lines) = cv2.threshold(
        img_final_bin_lines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite('./img_final_bin_lines_fix.jpg', img_final_bin_lines)

    # Derive the grid coordinates from the table lines.
    x_grids, y_grids, line_w = get_grid_coordinate(img_final_bin_lines)

输入：

提取竖直线：

提取水平线：

水平线与竖直线峰值查找：

2. 还原表格结构

import cv2
from PIL import Image
import numpy as np
import os
import os.path as osp
from scipy.signal import find_peaks, peak_widths

debug = True


def get_lines_from_image(img_bin, axis, kernel_len_div=20, kernel_len=None, iters=3):
    """Keep only the long vertical or horizontal strokes of an image.

    The image is eroded and then dilated with a one-pixel-thick rectangular
    kernel, so only bright runs at least as long as the kernel survive.

    :param img_bin: OpenCV image
    :param axis: 0 selects vertical lines, anything else horizontal lines
    :param kernel_len_div: kernel length as a fraction of the image side
        (``shape[axis] // kernel_len_div``, at least 1)
    :param kernel_len: explicit kernel length; when given it must be > 0 and
        ``kernel_len_div`` is ignored
    :return: image containing only the detected lines
    """
    DEBUG = 0  # set truthy to dump the intermediate line image to disk

    if kernel_len is not None:
        assert kernel_len > 0
        kernel_length = kernel_len
    else:
        kernel_length = max(np.array(img_bin).shape[axis] // kernel_len_div, 1)

    # getStructuringElement takes (width, height): a 1-wide kernel keeps
    # vertical strokes, a 1-high kernel keeps horizontal strokes.
    if axis == 0:
        kernel_size, debug_name = (1, kernel_length), "verticle_lines.jpg"
    else:
        kernel_size, debug_name = (kernel_length, 1), "horizontal_lines.jpg"

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    eroded = cv2.erode(img_bin, kernel, iterations=iters)
    lines_img = cv2.dilate(eroded, kernel, iterations=iters)
    if DEBUG:
        cv2.imwrite(debug_name, lines_img)
    return lines_img


def line_img_add(verticle_lines_img, horizontal_lines_img):
    """Blend the vertical-line and horizontal-line images with equal weights."""
    alpha = 0.5
    beta = 1.0 - alpha
    return cv2.addWeighted(verticle_lines_img, alpha, horizontal_lines_img, beta, 0.0)


def project(np_arr, axis):
    """Count the zero-valued (black) pixels along ``axis``.

    ``axis=0`` yields one count per column, ``axis=1`` one count per row.
    """
    return np.count_nonzero(np_arr == 0, axis=axis)


def get_grid_coordinate(img_bin, prominence_ratio=0.3, height_ratio=None, distance=None):
    """Locate the table grid lines and estimate the line width.

    :param img_bin: 2-D image with black (0) lines on a white background
    :param prominence_ratio: minimum peak prominence, relative to the image
        height (for column peaks) or width (for row peaks)
    :param height_ratio: optional minimum peak height, scaled the same way as
        in the code below (``height_ratio * h`` for x, ``height_ratio * w`` for y)
    :param distance: optional minimum distance between peaks, passed to
        ``scipy.signal.find_peaks``
    :return: ``(x_peaks, y_peaks, line_width)``; the peaks are lists of plain
        ``int`` coordinates and ``line_width`` is at least 1
    """
    h, w = img_bin.shape
    DEBUG = False
    if DEBUG:
        cv2.imwrite('table_crop.jpg', img_bin)

    x_prj = project(img_bin, 0)  # black pixels per column
    y_prj = project(img_bin, 1)  # black pixels per row

    height_x = height_y = None
    if height_ratio is not None:
        height_x = height_ratio * h
        height_y = height_ratio * w

    x_peaks, _ = find_peaks(x_prj, height=height_x, distance=distance,
                            prominence=(h * prominence_ratio, None))
    y_peaks, _ = find_peaks(y_prj, height=height_y, distance=distance,
                            prominence=(w * prominence_ratio, None))
    # tolist() yields native ints rather than numpy scalars, so the
    # coordinates stay JSON-serialisable further down the pipeline.
    x_peaks = x_peaks.tolist()
    y_peaks = y_peaks.tolist()

    if DEBUG:
        import matplotlib.pyplot as plt
        plt.subplot(211)
        plt.title("x")
        plt.plot(range(x_prj.shape[0]), x_prj)
        plt.plot(x_peaks, x_prj[x_peaks], "x")
        plt.subplot(212)
        plt.title("y")
        plt.plot(range(y_prj.shape[0]), y_prj)
        plt.plot(y_peaks, y_prj[y_peaks], "x")
        plt.show()

    # No peak found: fall back to the image borders as grid lines.
    if len(x_peaks) == 0:
        x_peaks = [0, w]
    if len(y_peaks) == 0:
        y_peaks = [0, h]

    # Line-width estimate, assuming every line has the same width x.
    # With m horizontal and n vertical lines in an h-by-w table:
    #   black_pixels = m*w*x + n*h*x - m*n*x^2  ~=  (m*w + n*h) * x
    m, n = len(y_peaks), len(x_peaks)
    line_width = np.count_nonzero(img_bin == 0) / (m * w + n * h)
    line_width = max(round(line_width), 1)
    return x_peaks, y_peaks, line_width
def check_line_exist(img_bin, pt1, pt2, width, threshold=0.5, DEBUG=0):
    """Decide whether a line is drawn between ``pt1`` and ``pt2``.

    A mask of the ideal segment is compared with the inverted, slightly
    dilated image; the line counts as present when the overlapping share of
    the mask exceeds ``threshold``.

    :param img_bin: 2-D image with black lines on a white background
    :param pt1: ``(x, y)`` start of the segment
    :param pt2: ``(x, y)`` end of the segment
    :param width: thickness of the segment mask in pixels
    :param threshold: minimum covered fraction of the mask
    :param DEBUG: when truthy, write the intermediate images to disk
    :return: True when the segment is considered present
    """
    # Crop to the neighbourhood of the segment to speed things up.
    h, w = img_bin.shape
    d = width * 2
    x1 = max(0, min(pt1[0], pt2[0]) - d)
    y1 = max(0, min(pt1[1], pt2[1]) - d)
    x2 = min(w - 1, max(pt1[0], pt2[0]) + d)
    y2 = min(h - 1, max(pt1[1], pt2[1]) + d)
    img_bin = img_bin[y1: y2, x1: x2].copy()

    # Shift the end points into crop coordinates; cast to int so cv2.line
    # also accepts points that arrive as numpy scalars.
    pt1 = (int(pt1[0] - x1), int(pt1[1] - y1))
    pt2 = (int(pt2[0] - x1), int(pt2[1] - y1))
    if DEBUG:
        cv2.imwrite('./img_bin_after_crop.jpg', img_bin)

    line_mask = np.zeros_like(img_bin)
    cv2.line(line_mask, pt1, pt2, color=(255, 255, 255), thickness=width)
    mask_cnt = np.count_nonzero(line_mask)

    # Invert so lines are bright, then dilate to tolerate small offsets.
    img_bin_tmp = ~img_bin.copy()
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    img_bin_tmp = cv2.dilate(img_bin_tmp, kernel, iterations=1)
    img_after_mask = cv2.bitwise_and(line_mask, img_bin_tmp)
    and_cnt = np.count_nonzero(img_after_mask)

    if DEBUG:
        cv2.imwrite("./line_mask.jpg", line_mask)
        cv2.imwrite('./img_after_mask.jpg', img_after_mask)
        cv2.imwrite("./img_bin_tmp.jpg", img_bin_tmp)

    if mask_cnt == 0:
        # Nothing of the segment falls inside the crop: no evidence of a line
        # (and the ratio below would divide by zero).
        return False
    return (and_cnt / mask_cnt) > threshold


def get_table_structure(img_final_bin_lines, x_grids, y_grids, line_w):
    """Infer the cell layout, merging grid cells whose shared edge is missing.

    Every interior grid edge is tested with ``check_line_exist``; when an edge
    is absent the two cells on either side are put into the same region.

    :param img_final_bin_lines: 2-D image with black table lines on white
    :param x_grids: x coordinates of the vertical grid lines
    :param y_grids: y coordinates of the horizontal grid lines
    :param line_w: line width used when testing an edge
    :return: ``(cell_id_mark, rst)``; ``cell_id_mark`` is an int array of shape
        ``(len(y_grids) - 1, len(x_grids) - 1)`` holding the region id of each
        grid cell, ``rst`` is a list of dicts, one per region, with the keys
        ``id``, ``row_start``, ``col_start``, ``row_end``, ``col_end`` (grid
        indices, end exclusive), ``x1``, ``y1``, ``x2``, ``y2`` (pixel
        coordinates) and the placeholders ``crnn`` and ``text``
    """
    DEBUG = 0
    n_x = len(x_grids)
    n_y = len(y_grids)
    if DEBUG:
        print("n_x, n_y", n_x, n_y)

    # Region id per grid cell (-1 = not labelled yet) and cells per region id.
    cell_id_mark = np.full((n_y - 1, n_x - 1), -1, dtype=int)
    cell_id_sets = [set() for _ in range(n_x * n_y)]
    next_id = 0

    # Interior vertical edges: a missing edge joins the left and right cell.
    for x_id in range(1, n_x - 1):
        x = x_grids[x_id]
        for y_id in range(n_y - 1):
            if check_line_exist(img_final_bin_lines, (x, y_grids[y_id]), (x, y_grids[y_id + 1]),
                                width=line_w, threshold=0.5, DEBUG=False):
                continue
            if DEBUG:
                print("没有发现竖直线:x_id,y_id", x_id, y_id)
            left_id = int(cell_id_mark[y_id, x_id - 1])
            if left_id == -1:
                cell_id_mark[y_id, x_id - 1] = next_id
                cell_id_mark[y_id, x_id] = next_id
                cell_id_sets[next_id].update({(y_id, x_id - 1), (y_id, x_id)})
                next_id += 1
            else:
                cell_id_mark[y_id, x_id] = left_id
                cell_id_sets[left_id].add((y_id, x_id))
    if DEBUG:
        print('==cell_id_mark:', cell_id_mark)

    # Interior horizontal edges: a missing edge joins the upper and lower cell.
    for y_id in range(1, n_y - 1):
        y = y_grids[y_id]
        for x_id in range(n_x - 1):
            if check_line_exist(img_final_bin_lines, (x_grids[x_id + 1], y), (x_grids[x_id], y),
                                width=line_w, threshold=0.5, DEBUG=False):
                continue
            if DEBUG:
                print("======没有发现水平线,x_id, y_id", x_id, y_id)
            up_id = int(cell_id_mark[y_id - 1, x_id])
            down_id = int(cell_id_mark[y_id, x_id])
            if up_id == -1 and down_id == -1:
                cell_id_mark[y_id - 1, x_id] = next_id
                cell_id_mark[y_id, x_id] = next_id
                cell_id_sets[next_id].update({(y_id - 1, x_id), (y_id, x_id)})
                next_id += 1
            elif up_id == -1:
                # Only the lower cell is labelled: the upper cell joins its
                # region, so no cell ends up in two regions.
                cell_id_mark[y_id - 1, x_id] = down_id
                cell_id_sets[down_id].add((y_id - 1, x_id))
            elif down_id == -1:
                cell_id_mark[y_id, x_id] = up_id
                cell_id_sets[up_id].add((y_id, x_id))
            elif up_id != down_id:
                # Merge the lower region into the upper one and relabel all
                # of its cells so cell_id_mark never points at an emptied set.
                for row, col in cell_id_sets[down_id]:
                    cell_id_mark[row, col] = up_id
                cell_id_sets[up_id] |= cell_id_sets[down_id]
                cell_id_sets[down_id].clear()
    if DEBUG:
        print('==x_grids:', x_grids)
        print('==y_grids:', y_grids)
        print('==cell_id_mark:', cell_id_mark)
        print('==cell_id_sets:', cell_id_sets)

    # Every cell that is still unlabelled becomes its own region
    # (column by column, so the ids keep their original order).
    for x_id in range(n_x - 1):
        for y_id in range(n_y - 1):
            if cell_id_mark[y_id, x_id] == -1:
                cell_id_mark[y_id, x_id] = next_id
                cell_id_sets[next_id].add((y_id, x_id))
                next_id += 1
    if DEBUG:
        print('==cell_id_mark:', cell_id_mark)
        print('==cell_id_sets:', cell_id_sets)
        print('==id:', next_id)

    # Emit one record per non-empty region, using its bounding box.
    rst = []
    for region_id in range(next_id):
        cells = cell_id_sets[region_id]
        if not cells:
            continue  # emptied by a merge
        row_min = min(row for row, _ in cells)
        row_max = max(row for row, _ in cells)
        col_min = min(col for _, col in cells)
        col_max = max(col for _, col in cells)
        rst.append({
            "id": region_id,
            # Position in grid cells (end exclusive).
            "row_start": row_min,
            "col_start": col_min,
            "row_end": row_max + 1,
            "col_end": col_max + 1,
            # Position in pixels.
            "x1": x_grids[col_min],
            "y1": y_grids[row_min],
            "x2": x_grids[col_max + 1],
            "y2": y_grids[row_max + 1],
            # Filled in by later stages.
            "crnn": [],
            "text": "",
        })
    return cell_id_mark, rst


def _erase_contours(img_bin, should_erase):
    """Blank out every contour of ``img_bin`` for which ``should_erase`` is true."""
    # findContours returns (image, contours, hierarchy) on OpenCV 3.x and
    # (contours, hierarchy) on 2.x/4.x; the contours are always second to last.
    contours = cv2.findContours(img_bin, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    mask = np.ones(img_bin.shape[:2], dtype="uint8") * 255
    for c in contours:
        if should_erase(c):
            cv2.drawContours(mask, [c], -1, 0, -1)
    return cv2.bitwise_and(img_bin, img_bin, mask=mask)


def box_extraction(cv_img):
    """Extract the structure of a ruled (bordered) table.

    :param cv_img: OpenCV image; a 3-dimensional array is converted from BGR
        to grayscale first
    :return: ``(x_grids, y_grids, cell_id_mark, rst)`` as produced by
        ``get_grid_coordinate`` and ``get_table_structure``
    """
    if len(cv_img.shape) == 3:
        cv_img = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

    # Binarise and invert, so that dark pixels of the input become bright.
    img_bin = cv2.adaptiveThreshold(cv_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
    img_bin = 255 - img_bin

    # First pass: drop contours whose bounding box is small in both directions.
    th_w = img_bin.shape[1] / 30
    th_h = img_bin.shape[0] / 30

    def _is_small_box(c):
        x, y, w, h = cv2.boundingRect(c)
        return w < th_w and h < th_h

    img_bin = _erase_contours(img_bin, _is_small_box)
    if debug:
        cv2.imwrite('./img_bin_no_noise.jpg', img_bin)

    # Second pass: dilate, then drop contours with a small area.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    img_bin = cv2.dilate(img_bin, kernel, iterations=1)
    min_area = (img_bin.shape[1] / 5) * (img_bin.shape[0] / 5)
    img_bin = _erase_contours(img_bin, lambda c: cv2.contourArea(c) < min_area)
    if debug:
        cv2.imwrite("img_remove_noise2.jpg", img_bin)

    verticle_lines_img = get_lines_from_image(img_bin, 0, kernel_len_div=40)
    horizontal_lines_img = get_lines_from_image(img_bin, 1, kernel_len_div=40)

    # Combine both line images into one table-line image.
    img_final_bin_lines = line_img_add(verticle_lines_img, horizontal_lines_img)

    # Eroding the inverted image grows its dark regions; then binarise.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    img_final_bin_lines = cv2.erode(~img_final_bin_lines, kernel, iterations=2)
    (thresh, img_final_bin_lines) = cv2.threshold(
        img_final_bin_lines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    if debug:
        cv2.imwrite("img_final_bin_lines.jpg", img_final_bin_lines)

    # Derive the grid coordinates, then the cell structure.
    x_grids, y_grids, line_w = get_grid_coordinate(img_final_bin_lines)
    if debug:
        print('===x_grids, y_grids, line_w:', x_grids, y_grids, line_w)
    cell_id_mark, rst = get_table_structure(img_final_bin_lines, x_grids, y_grids, line_w)
    return x_grids, y_grids, cell_id_mark, rst


def debug_single_img():
    """Run the extraction on one sample image and print the results."""
    # img_path = './table_crop.jpg'
    img_path = './table_crop2.png'
    img = cv2.imread(img_path, 0)  # flag 0: load as grayscale
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('cannot read image: ' + img_path)
    x_grids, y_grids, cell_id_mark, rst = box_extraction(img)
    print('==x_grids:', x_grids)
    print('==y_grids:', y_grids)
    print('==cell_id_mark:', cell_id_mark)
    print('==rst:', rst)


if __name__ == '__main__':
    debug_single_img()

再将 rst 接入这篇博客中的方法，就可以还原出相应的 Excel 了。