forked from chineseocr/table-detect
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtable_ceil.py
330 lines (281 loc) · 12.6 KB
/
table_ceil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import cv2
from PIL import Image
import numpy as np
from table_detect import table_detect
from table_line import table_line
from table_build import tableBuid,to_excel
from utils import minAreaRectbox, measure, eval_angle, draw_lines
from chineseocr_lite.test import ChineseOcr
class table:
def __init__(self, img, tableSize=(416, 416), tableLineSize=(1024, 1024), isTableDetect=False, isToExcel=False):
self.img = img
self.tableSize = tableSize
self.tableLineSize = tableLineSize
self.isTableDetect = isTableDetect
self.isToExcel = isToExcel
self.img_degree()
self.table_boxes_detect() ##表格定位
self.table_ceil() ##表格单元格定位
self.ocr = self.table_ocr() # yby_edit
self.table_build()
def img_degree(self):
img, degree = eval_angle(self.img, angleRange=[-15, 15])
self.img = img
self.degree = degree
def table_boxes_detect(self):
h, w = self.img.shape[:2]
if self.isTableDetect:
boxes, adBoxes, scores = table_detect(self.img, sc=self.tableSize, thresh=0.2, NMSthresh=0.3)
if len(boxes) == 0:
boxes = [[0, 0, w, h]]
adBoxes = [[0, 0, w, h]]
scores = [0]
else:
boxes = [[0, 0, w, h]]
adBoxes = [[0, 0, w, h]]
scores = [0]
self.boxes = boxes
self.adBoxes = adBoxes
self.scores = scores
def table_ceil(self):
###表格单元格
n = len(self.adBoxes)
self.tableCeilBoxes = []
self.childImgs = []
for i in range(n):
xmin, ymin, xmax, ymax = [int(x) for x in self.adBoxes[i]]
childImg = self.img[ymin:ymax, xmin:xmax]
rowboxes, colboxes = table_line(childImg[..., ::-1], size=self.tableLineSize, hprob=0.5, vprob=0.5)
tmp = np.zeros(self.img.shape[:2], dtype='uint8')
tmp = draw_lines(tmp, rowboxes + colboxes, color=255, lineW=2)
labels = measure.label(tmp < 255, connectivity=2) # 8连通区域标记
regions = measure.regionprops(labels)
ceilboxes = minAreaRectbox(regions, False, tmp.shape[1], tmp.shape[0], True, True)
ceilboxes = np.array(ceilboxes)
ceilboxes[:, [0, 2, 4, 6]] += xmin
ceilboxes[:, [1, 3, 5, 7]] += ymin
self.tableCeilBoxes.extend(ceilboxes)
self.childImgs.append(childImg)
def table_build(self):
tablebuild = tableBuid(self.tableCeilBoxes)
cor = tablebuild.cor
for i, line in enumerate(cor):
# line['text'] = 'table-test'##ocr
line['text'] = self.ocr[i]
if self.isToExcel:
workbook = to_excel(cor, workbook=None)
else:
workbook=None
self.res = cor
self.workbook = workbook
def table_ocr(self):
"""use ocr and match ceil"""
res = []
tablebuild = tableBuid(self.tableCeilBoxes)
for box in tablebuild.diagBoxes:
# 获取表格的左上角和右下角坐标
x1, y1, x2, y2 = box[0],box[1],box[2],box[3]
# 截取表格部分的图片
table_img = self.img[y1:y2, x1:x2]
# cv2.imwrite('./img_temp/cropped.png', table_img)
# img = "./img_temp/cropped.png"
x = ChineseOcr(table_img)
if x:
txt = ""
for i in range(len(x)):
if x[i]["text"]=="一":
txt = txt + " " + "---"
else:
txt = txt + " " + x[i]["text"]
res.append(txt[1:])
else:
res.append("---")
# print(res)
return res
# excel转html表格
def to_html(workbook=None, file_path=None):
"""
file_path存在,则代表从文件中读取workbook
workbook存在,则代表从内存中读取workbook
"""
import io
import xlrd
import xlwt
from xlutils.copy import copy
from xlwt.Utils import rowcol_to_cell
output = []
merged_cells = None
# 如果 file_path 存在,则从文件中读取工作簿
if file_path:
book = xlrd.open_workbook(file_path)
# 如果 workbook 存在,则从内存中读取工作簿
elif workbook:
# 将 xlwt.Workbook 对象写入内存中的二进制流
stream = io.BytesIO()
workbook.save(stream)
# 使用 xlrd.open_workbook() 函数打开内存中的二进制流并返回一个 xlrd.Book 对象
stream.seek(0)
book = xlrd.open_workbook(file_contents=stream.read())
else:
return "未提供有效的工作簿信息"
# 遍历工作簿中的表格
for sheet in book.sheets():
output.append('<table>')
merged_cells = sheet.merged_cells
for row in range(sheet.nrows):
output.append('<tr>')
for col in range(sheet.ncols):
cell_value = sheet.cell_value(row, col)
rowspan, colspan = get_merged_cell_range(merged_cells, row, col)
if rowspan or colspan:
output.append(f'<td rowspan="{rowspan+1}" colspan="{colspan+1}">{cell_value}</td>')
else:
output.append(f'<td>{cell_value}</td>')
output.append('</tr>')
output.append('</table>')
return '\n'.join(output)
# 确定单元格是否是合并单元格,并返回其跨度
def get_merged_cell_range(merged_cells, row, col):
for crange in merged_cells:
rlo, rhi, clo, chi = crange
if row == rlo and col == clo:
return rhi - rlo, chi - clo
return 0, 0
# 前端展示一张图的表格识别
def table_predict(img, short_size=480):
import os
import time
t = time.time()
tableDetect = table(img,tableSize=[416, 416],
tableLineSize=[1024, 1024],
isTableDetect=False,
isToExcel=True
)
tableCeilBoxes = tableDetect.tableCeilBoxes
tableJson = tableDetect.res
workbook = tableDetect.workbook
img = tableDetect.img
print(time.time() - t)
result = ""
if workbook is not None:
result = to_html(workbook=workbook)
return result
# 将html保存为本地文件
def save_html_to_file(html_string, file_path):
with open(file_path, 'w', encoding='utf-8') as f:
f.write(html_string)
if __name__ == '__main__':
import argparse
import os
import time
from utils import draw_boxes
parser = argparse.ArgumentParser(description='tabel to excel demo')
parser.add_argument('--isTableDetect', default=False, type=bool, help="是否先进行表格检测")
parser.add_argument('--tableSize', default='416,416', type=str, help="表格检测输入size")
parser.add_argument('--tableLineSize', default='1024,1024', type=str, help="表格直线输入size")
parser.add_argument('--isToExcel', default=False, type=bool, help="是否输出到excel")
parser.add_argument('--isToHtml', default=False, type=bool, help="是否输出html")
parser.add_argument('--folderPath', default='img', type=str, help="图像文件夹路径")
parser.add_argument('--jpgPath', default='',type=str, help="单张图像路径")
args = parser.parse_args()
args.tableSize = [int(x) for x in args.tableSize.split(',')]
args.tableLineSize = [int(x) for x in args.tableLineSize.split(',')]
print(args)
if args.jpgPath == '':
# 递归获取图像文件路径
img_paths = []
for root, dirs, files in os.walk(args.folderPath):
for file in files:
if file.endswith(".jpg") or file.endswith(".png"):
img_paths.append(os.path.join(root, file))
for img_path in img_paths:
img = cv2.imread(img_path)
# 灰度增强
gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
histogram = cv2.calcHist([gray_image],[0],None,[256],[0,256])
minimum_pixel_value, maximum_pixel_value, _, _ = cv2.minMaxLoc(gray_image)
for i in range(len(histogram)):
histogram_value = histogram[i]
if i < minimum_pixel_value or i > maximum_pixel_value:
histogram[i] = 0
else:
histogram[i] = int(255 * (i - minimum_pixel_value) / (maximum_pixel_value - minimum_pixel_value))
img = cv2.LUT(gray_image, histogram)
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
img = img.astype(np.uint8)
# # 边缘锐化增强
# kernel = np.array([[0, -2, 0], [-2, 9, -2], [0, -2, 0]])
# img = cv2.filter2D(img, -1, kernel)
t = time.time()
tableDetect = table(img,tableSize=args.tableSize,
tableLineSize=args.tableLineSize,
isTableDetect=args.isTableDetect,
isToExcel=args.isToExcel
)
tableCeilBoxes = tableDetect.tableCeilBoxes
tableJson = tableDetect.res
workbook = tableDetect.workbook
img = tableDetect.img
tmp = np.zeros_like(img)
img = draw_boxes(tmp, tableDetect.tableCeilBoxes, color=(255, 255, 255))
print(time.time() - t)
# 获取文件名和文件后缀
filename, extension = os.path.splitext(img_path)
# 拼接新的文件路径
new_path = os.path.join(os.path.dirname(os.path.dirname(filename)), 'result', os.path.basename(filename))
pngP = new_path+'ceil.png'
cv2.imwrite(pngP, img)
if workbook is not None:
workbook.save(new_path+'.xls')
if args.isToHtml == True:
html_string = to_html(file_path=new_path+'.xls')
save_html_to_file(html_string, new_path+'.html')
else:
img = cv2.imread(args.jpgPath)
# 灰度增强
gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
histogram = cv2.calcHist([gray_image],[0],None,[256],[0,256])
minimum_pixel_value, maximum_pixel_value, _, _ = cv2.minMaxLoc(gray_image)
for i in range(len(histogram)):
histogram_value = histogram[i]
if i < minimum_pixel_value or i > maximum_pixel_value:
histogram[i] = 0
else:
histogram[i] = int(255 * (i - minimum_pixel_value) / (maximum_pixel_value - minimum_pixel_value))
img = cv2.LUT(gray_image, histogram)
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
img = img.astype(np.uint8)
# # 边缘锐化增强
# kernel = np.array([[0, -2, 0], [-2, 9, -2], [0, -2, 0]])
# img = cv2.filter2D(img, -1, kernel)
t = time.time()
tableDetect = table(img,tableSize=args.tableSize,
tableLineSize=args.tableLineSize,
isTableDetect=args.isTableDetect,
isToExcel=args.isToExcel
)
tableCeilBoxes = tableDetect.tableCeilBoxes
tableJson = tableDetect.res
workbook = tableDetect.workbook
img = tableDetect.img
tmp = np.zeros_like(img)
img = draw_boxes(tmp, tableDetect.tableCeilBoxes, color=(255, 255, 255))
print(time.time() - t)
# 获取文件名和文件后缀
filename, extension = os.path.splitext(args.jpgPath)
# 拼接新的文件路径
new_path = os.path.join(os.path.dirname(os.path.dirname(filename)), 'result', os.path.basename(filename))
print(new_path)
pngP = new_path+'ceil.png'
cv2.imwrite(pngP, img)
if workbook is not None:
# workbook.save(os.path.splitext(args.jpgPath)[0]+'.xls')
workbook.save(new_path+'.xls')
if args.isToHtml == True:
# file_path = os.path.splitext(args.jpgPath)[0]
# html_string = to_html(file_path=file_path+'.xls')
# save_html_to_file(html_string, file_path+'.html')
html_string = to_html(file_path=new_path+'.xls')
save_html_to_file(html_string, new_path+'.html')