Using YOLO or other image recognition techniques to identify all alphanumeric text present in images


Question



I have multiple diagram images, all of which contain labels as alphanumeric characters rather than just the text label itself. I want my YOLO model to identify all the numbers and alphanumeric characters present in them.

How can I train my YOLO model to do this? The dataset can be found here: https://drive.google.com/open?id=1iEkGcreFaBIJqUdAADDXJbUrSj99bvoi

For example, see the bounding boxes. I want YOLO to detect wherever text is present. However, it is currently not necessary to identify the text inside the boxes.

The same also needs to be done for these types of images.

The images can be downloaded here.

This is what I have tried using OpenCV, but it does not work for all the images in the dataset:

import cv2
import numpy as np
import pytesseract

# Point pytesseract at the local Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\HPO2KOR\AppData\Local\Tesseract-OCR\tesseract.exe"

# Load the image, convert to grayscale, and apply Otsu's threshold
image = cv2.imread(r'C:\Users\HPO2KOR\Desktop\Work\venv\Patent\PARTICULATE DETECTOR\PD4.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
clean = thresh.copy()

# Remove horizontal lines
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15,1))
detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(detect_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(clean, [c], -1, 0, 3)

# Remove vertical lines
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,30))
detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(detect_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(clean, [c], -1, 0, 3)

# Remove remaining non-text contours by area and shape
cnts = cv2.findContours(clean, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    area = cv2.contourArea(c)
    if area < 100:
        cv2.drawContours(clean, [c], -1, 0, 3)
    elif area > 1000:
        cv2.drawContours(clean, [c], -1, 0, -1)
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)
    x,y,w,h = cv2.boundingRect(c)
    if len(approx) == 4:
        cv2.rectangle(clean, (x, y), (x + w, y + h), 0, -1)

# Morphological open, then close, to merge text into single contours
open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))
opening = cv2.morphologyEx(clean, cv2.MORPH_OPEN, open_kernel, iterations=2)
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,2))
close = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, close_kernel, iterations=4)
# Find the merged regions and OCR each sufficiently large ROI, keeping
# only regions whose OCR output is purely alphanumeric
cnts = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    area = cv2.contourArea(c)
    if area > 500:
        ROI = image[y:y+h, x:x+w]
        ROI = cv2.GaussianBlur(ROI, (3,3), 0)
        data = pytesseract.image_to_string(ROI, lang='eng',config='--psm 6')
        if data.isalnum():
            cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), 2)
            print(data)

# Save the intermediate results
cv2.imwrite('image.png', image)
cv2.imwrite('clean.png', clean)
cv2.imwrite('close.png', close)
cv2.imwrite('opening.png', opening)
cv2.waitKey()

Is there any model, OpenCV technique, or pre-trained model that can do this for me? I just need the bounding boxes around all the alphanumeric characters present in the images. After that, I need to identify what is present in them. However, the second part is not important currently.

Solution

A possible approach is to use the EAST (Efficient and Accurate Scene Text) deep learning text detector, based on Zhou et al.'s 2017 paper, EAST: An Efficient and Accurate Scene Text Detector. The model was originally trained for detecting text in natural scene images, but it may be possible to apply it to diagram images. EAST is quite robust and is capable of detecting blurred or reflective text. Here is a modified version of Adrian Rosebrock's implementation of EAST. Instead of applying the text detector directly to the image, we can try to remove as many non-text objects from the image as possible before performing text detection. The idea is to remove horizontal lines, vertical lines, and non-text contours (curves, diagonals, circular shapes) before applying detection. Here are the results with some of your images:

Input -> Non-text contours to remove in green

Result

Other images

The pretrained frozen_east_text_detection.pb model necessary to perform text detection can be found here. Although the model catches most of the text, the results are not 100% accurate, with occasional false positives, probably because of how it was trained on natural scene images. To obtain more accurate results you would probably have to train your own custom model. But if you want a decent out-of-the-box solution, then this should work for you. Check out Adrian's OpenCV Text Detection (EAST text detector) blog post for a more comprehensive explanation of the EAST text detector.
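One inexpensive knob before training anything custom: the EAST_text_detector helper in the code below exposes a confidence parameter (default 0.25). If the natural-scene bias produces too many false positives on your diagrams, raising the threshold trades recall for precision, for example:

# Stricter score threshold: fewer false positives, at the risk of missed labels
result = EAST_text_detector(image, filtered, confidence=0.5)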

Code

from imutils.object_detection import non_max_suppression
import numpy as np
import cv2

def EAST_text_detector(original, image, confidence=0.25):
    # Set the new width and height and determine the changed ratio
    (h, W) = image.shape[:2]
    (newW, newH) = (640, 640)
    rW = W / float(newW)
    rH = h / float(newH)

    # Resize the image and grab the new image dimensions
    image = cv2.resize(image, (newW, newH))
    (h, W) = image.shape[:2]

    # Define the two output layer names for the EAST detector model that
    # we are interested in -- the first is the output probabilities and the
    # second can be used to derive the bounding box coordinates of text
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]

    net = cv2.dnn.readNet('frozen_east_text_detection.pb')

    # Construct a blob from the image and then perform a forward pass of
    # the model to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(image, 1.0, (W, h), (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # Grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # Loop over the number of rows
    for y in range(0, numRows):
        # Extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates that
        # surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # Loop over the number of columns
        for x in range(0, numCols):
            # If our score does not have sufficient probability, ignore it
            if scoresData[x] < confidence:
                continue

            # Compute the offset factor as our resulting feature maps will
            # be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # Extract the rotation angle for the prediction and then
            # compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # Use the geometry volume to derive the width and height of
            # the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # Compute both the starting and ending (x, y)-coordinates for
            # the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # Add the bounding box coordinates and probability score to
            # our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # Apply non-maxima suppression to suppress weak, overlapping bounding
    # boxes
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    # Loop over the bounding boxes
    for (startX, startY, endX, endY) in boxes:
        # Scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # Draw the bounding box on the image
        cv2.rectangle(original, (startX, startY), (endX, endY), (36, 255, 12), 2)
    return original

# Convert to grayscale and Otsu's threshold
image = cv2.imread('1.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
clean = thresh.copy()

# Remove horizontal lines
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15,1))
detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(detect_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(clean, [c], -1, 0, 3)

# Remove vertical lines
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,30))
detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(detect_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(clean, [c], -1, 0, 3)

# Remove non-text contours (curves, diagonals, circular shapes)
cnts = cv2.findContours(clean, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    area = cv2.contourArea(c)
    if area > 1500:
        cv2.drawContours(clean, [c], -1, 0, -1)
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)
    x,y,w,h = cv2.boundingRect(c)
    if len(approx) == 4:
        cv2.rectangle(clean, (x, y), (x + w, y + h), 0, -1)

# Bitwise-and with original image to remove contours
filtered = cv2.bitwise_and(image, image, mask=clean)
filtered[clean==0] = (255,255,255)

# Perform EAST text detection
result = EAST_text_detector(image, filtered)

cv2.imshow('filtered', filtered)
cv2.imshow('result', result)
cv2.waitKey()
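If you later want to read the text inside each detected box (the "second part" of the question), one option is to adapt EAST_text_detector so it also returns the scaled (startX, startY, endX, endY) tuples instead of only drawing them, and then run Tesseract on each crop. A minimal sketch, assuming pytesseract is installed and boxes holds those scaled coordinates; the read_detected_text helper below is hypothetical, not part of the code above:

import pytesseract

def read_detected_text(original, boxes):
    # OCR each detected region; --psm 7 treats the crop as a single text line
    texts = []
    for (startX, startY, endX, endY) in boxes:
        # Pad the crop slightly so border characters are not clipped;
        # numpy slicing clamps the upper bounds automatically
        roi = original[max(startY - 2, 0):endY + 2, max(startX - 2, 0):endX + 2]
        text = pytesseract.image_to_string(roi, config='--psm 7').strip()
        if text:
            texts.append(text)
    return texts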
