Bad character recognition with Pytesseract OCR for images with table structure
Problem description
I use a script to locate the text boxes and draw a rectangle around each of them. This allows me to rebuild the grid around the table structure in the image.
However, even though the text box detection works very well, when I try to recognize the characters present in each rectangle, pytesseract does not identify them reliably and I cannot recover the original text.
Here is my Python code:
import os
import cv2
import imutils
import argparse
import numpy as np
import pytesseract
# This only works if there's only one table on a page
# Important parameters:
# - morph_size
# - min_text_height_limit
# - max_text_height_limit
# - cell_threshold
# - min_columns
def pre_process_image(img, save_in_file, morph_size=(8, 8)):
# get rid of the color
pre = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
def img_estim(img, threshold=127):
is_dark = np.mean(img) < threshold
return True if is_dark else False
# Negative
if img_estim(pre):
print("non")
pre = cv2.bitwise_not(pre)
# Contrast & Brightness control
contrast = 2.0 #0 to 3
brightness = 0 #0 to 100
for y in range(pre.shape[0]):
for x in range(pre.shape[1]):
pre[y,x] = np.clip(contrast*pre[y,x] + brightness, 0, 255)
# Otsu threshold
pre = cv2.threshold(pre, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# dilate the text to make it solid spot
cpy = pre.copy()
struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
pre = ~cpy
if save_in_file is not None:
cv2.imwrite(save_in_file, pre)
return pre
def find_text_boxes(pre, min_text_height_limit=15, max_text_height_limit=40):
# Looking for the text spots contours
# OpenCV 3
# img, contours, hierarchy = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# OpenCV 4
contours, hierarchy = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
box = cv2.boundingRect(contour)
h = box[3]
if min_text_height_limit < h < max_text_height_limit:
boxes.append(box)
return boxes
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
rows = {}
cols = {}
# Clustering the bounding boxes by their positions
for box in boxes:
(x, y, w, h) = box
col_key = x // cell_threshold
row_key = y // cell_threshold
        cols[col_key] = [box] if col_key not in cols else cols[col_key] + [box]  # cluster by column key, not row key
rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]
# Filtering out the clusters having less than 2 cols
table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
# Sorting the row cells by x coord
table_cells = [list(sorted(tb)) for tb in table_cells]
# Sorting rows by the y coord
table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))
return table_cells
def build_lines(table_cells):
if table_cells is None or len(table_cells) <= 0:
return [], []
max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]
max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
max_y = max_last_row_height_box[1] + max_last_row_height_box[3]
hor_lines = []
ver_lines = []
for box in table_cells:
x = box[0][0]
y = box[0][1]
hor_lines.append((x, y, max_x, y))
for box in table_cells[0]:
x = box[0]
y = box[1]
ver_lines.append((x, y, x, max_y))
(x, y, w, h) = table_cells[0][-1]
ver_lines.append((max_x, y, max_x, max_y))
(x, y, w, h) = table_cells[0][0]
hor_lines.append((x, max_y, max_x, max_y))
return hor_lines, ver_lines
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
help="path to input image to be OCR'd")
# ap.add_argument("-east", "--east", type=str,
# help="path to input EAST text detector")
args = vars(ap.parse_args())
in_file = os.path.join("images", args["image"])
pre_file = os.path.join("images", "pre.png")
out_file = os.path.join("images", "out.png")
img = cv2.imread(os.path.join(in_file))
top, bottom, left, right = [25]*4
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_REPLICATE)
orig = img.copy()
pre_processed = pre_process_image(img, pre_file)
text_boxes = find_text_boxes(pre_processed)
cells = find_table_in_boxes(text_boxes)
hor_lines, ver_lines = build_lines(cells)
# (H, W) = img.shape[:2]
# net = cv2.dnn.readNet(args["east"])
# blob = cv2.dnn.blobFromImage(img, 1.0, (W, H),(123.68, 116.78, 103.94), swapRB=True, crop=False)
# net.setInput(blob)
# Visualize the result
vis = img.copy()
results = []
for box in text_boxes:
(x, y, w, h) = box
startX = x -2
startY = y -2
endX = x + w
endY = y + h
cv2.rectangle(vis, (startX, startY), (endX, endY), (0, 255, 0), 1)
        # NumPy slicing is [rows, cols], so index the ROI with y first, then x
        roi = orig[startY:endY, startX:endX]
        config = ("-l eng --psm 6")
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
        text = pytesseract.image_to_string(roi, config=config)
results.append(((startX, startY, (endX), (endY)), text))
results = sorted(results, key=lambda r:r[0][1])
output = orig.copy()
for ((startX, startY, endX, endY), text) in results:
print("{}\n".format(text))
text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
cv2.rectangle(output, (startX, startY), (endX, endY),(0, 0, 255), 1)
cv2.putText(output, text, (startX, startY - 20),cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
# for line in hor_lines:
# [x1, y1, x2, y2] = line
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
# for line in ver_lines:
# [x1, y1, x2, y2] = line
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
cv2.imwrite(out_file, vis)
cv2.imshow("Text Detection", output)
cv2.waitKey(0)
Initial image:
Preprocessed image with detection of text outlines to define the dimensions of the rectangles:
Final image:
Result obtained by OCR:
"
a ra at
12
1 "
Thank you in advance for your help, hope my description is clear enough.
Recommended answer

When performing OCR, it is extremely important to preprocess the image so that the foreground text is black and the background is white. In addition, enlarging the image can help improve the detection results. I've also found that adding a slight Gaussian blur before passing the image to Pytesseract improves accuracy. Here are the results with --psm 6, which treats the image as a single block of text. See the Tesseract documentation for more configuration options.
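For reference, other options can be passed through the same config string. The snippet below is only an illustrative sketch, not part of the original answer: the page segmentation modes and the character whitelist shown here are assumptions that would have to be tuned to the actual image.

import cv2
import pytesseract

# Hypothetical configuration variants:
# --psm 11 treats the page as sparse text, while a character whitelist
# restricts the symbols Tesseract is allowed to output.
img = cv2.imread('thresh.png')  # assumed to be the preprocessed image from above

sparse_config = '--oem 3 --psm 11'
digits_config = '--psm 6 -c tessedit_char_whitelist=0123456789.-'

print(pytesseract.image_to_string(img, lang='eng', config=sparse_config))
print(pytesseract.image_to_string(img, lang='eng', config=digits_config))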
Preprocessed enlarged, thresholded, and slightly blurred image
Results from Pytesseract OCR
Series Type Scan Range CTDIvol DLP Phantom
(mm) (mGy) — (mGy-cm) cm
1 Scout - - - -
1 Scout - - - -
2 Axial = 113.554-1272.929 11.22 269.35 Body 32
Total Exam DLP: = 269.35
1/1
Code
import cv2
import pytesseract
import imutils
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
image = cv2.imread('1.jpg')
# enlarge the image so small characters are easier for Tesseract to resolve
image = imutils.resize(image, width=700)
# convert to grayscale, then binarize with Otsu's threshold
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# slight Gaussian blur to smooth the thresholded text
thresh = cv2.GaussianBlur(thresh, (3,3), 0)
# OCR with page segmentation mode 6 (treat the image as a single block of text)
data = pytesseract.image_to_string(thresh, lang='eng', config='--psm 6')
print(data)
cv2.imshow('thresh', thresh)
cv2.imwrite('thresh.png', thresh)
cv2.waitKey()
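If the goal is still to OCR each detected cell separately, as in the question, the same preprocessing idea can be applied per ROI before calling pytesseract. This is a minimal sketch under stated assumptions rather than part of the answer: the ocr_cell helper, the resize width of 300, and --psm 7 (single text line) are illustrative choices that would need to be validated on the real table image.

import cv2
import imutils
import pytesseract

def ocr_cell(bgr_roi):
    # Hypothetical helper: enlarge the cell, binarize with Otsu's threshold,
    # blur slightly, then OCR it as a single line of text.
    roi = imutils.resize(bgr_roi, width=300)
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    thresh = cv2.GaussianBlur(thresh, (3, 3), 0)
    return pytesseract.image_to_string(thresh, lang='eng', config='--psm 7').strip()

# Usage with the variables from the question's script:
# for (x, y, w, h) in text_boxes:
#     cell_text = ocr_cell(orig[y:y + h, x:x + w])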