如何标记和检测文档文本图像

数据挖掘 神经网络 卷积
2022-02-25 04:47:33

这就是我所说的文档文本图像: 在此处输入图像描述

我想将图像中的文本标记为单独的块,我的模型应该将这些标签检测为类。

笔记:

最终结果应该是这样的:在此处输入图像描述

Block 1、Block 2、Block 3、.. 等标签应为 Logo、Title、Date、.. Others 等。

完成工作:

第一种方法:我尝试通过对象检测来实现此方法,但没有成功。它甚至没有检测到任何文本。

第二种方法:然后我尝试使用PixelLink由于该模型是为场景文本检测而构建的,因此它检测到了图像中的每一个文本。但是如果增加阈值,这种方法可以检测到多行文本。但我不知道如何为文本块添加标签。

PIXEL_CLS_WEIGHT_all_ones = 'PIXEL_CLS_WEIGHT_all_ones' 
PIXEL_CLS_WEIGHT_bbox_balanced = 'PIXEL_CLS_WEIGHT_bbox_balanced'
PIXEL_NEIGHBOUR_TYPE_4 = 'PIXEL_NEIGHBOUR_TYPE_4'
PIXEL_NEIGHBOUR_TYPE_8 = 'PIXEL_NEIGHBOUR_TYPE_8'

DECODE_METHOD_join = 'DECODE_METHOD_join'


def get_neighbours_8(x, y):
    """
    Get 8 neighbours of point(x, y)
    """
    return [(x - 1, y - 1), (x, y - 1), (x + 1, y - 1), \
        (x - 1, y),                 (x + 1, y),  \
        (x - 1, y + 1), (x, y + 1), (x + 1, y + 1)]


def get_neighbours_4(x, y):
    return [(x - 1, y), (x + 1, y), (x, y + 1), (x, y - 1)]


def get_neighbours(x, y):
    import config
    neighbour_type = config.pixel_neighbour_type
    if neighbour_type == PIXEL_NEIGHBOUR_TYPE_4:
        return get_neighbours_4(x, y)
    else:
        return get_neighbours_8(x, y)

def get_neighbours_fn():
    import config
    neighbour_type = config.pixel_neighbour_type
    if neighbour_type == PIXEL_NEIGHBOUR_TYPE_4:
        return get_neighbours_4, 4
    else:
        return get_neighbours_8, 8



def is_valid_cord(x, y, w, h):
    """
    Tell whether the 2D coordinate (x, y) is valid or not.
    If valid, it should be on an h x w image
    """
    return x >=0 and x < w and y >= 0 and y < h;

#=====================Ground Truth Calculation Begin==================
def tf_cal_gt_for_single_image(xs, ys, labels):
    pixel_cls_label, pixel_cls_weight,  \
    pixel_link_label, pixel_link_weight = \
        tf.py_func(
                    cal_gt_for_single_image, 
                    [xs, ys, labels],
                    [tf.int32, tf.float32, tf.int32, tf.float32]
                   )
    import config
    score_map_shape = config.score_map_shape
    num_neighbours = config.num_neighbours
    h, w = score_map_shape
    pixel_cls_label.set_shape(score_map_shape)
    pixel_cls_weight.set_shape(score_map_shape)
    pixel_link_label.set_shape([h, w, num_neighbours])
    pixel_link_weight.set_shape([h, w, num_neighbours])
    return pixel_cls_label, pixel_cls_weight, \
            pixel_link_label, pixel_link_weight


def cal_gt_for_single_image(normed_xs, normed_ys, labels):
    """
    Args:
        xs, ys: both in shape of (N, 4), 
            and N is the number of bboxes,
            their values are normalized to [0,1]
        labels: shape = (N,), only two values are allowed:
                                                        -1: ignored
                                                        1: text
    Return:
        pixel_cls_label
        pixel_cls_weight
        pixel_link_label
        pixel_link_weight
    """
    import config
    score_map_shape = config.score_map_shape
    pixel_cls_weight_method  = config.pixel_cls_weight_method
    h, w = score_map_shape
    text_label = config.text_label
    ignore_label = config.ignore_label
    background_label = config.background_label
    num_neighbours = config.num_neighbours
    bbox_border_width = config.bbox_border_width
    pixel_cls_border_weight_lambda = config.pixel_cls_border_weight_lambda

    # validate the args
    assert np.ndim(normed_xs) == 2
    assert np.shape(normed_xs)[-1] == 4
    assert np.shape(normed_xs) == np.shape(normed_ys)
    assert len(normed_xs) == len(labels)

#     assert set(labels).issubset(set([text_label, ignore_label, background_label]))

    num_positive_bboxes = np.sum(np.asarray(labels) == text_label)
    # rescale normalized xys to absolute values
    xs = normed_xs * w
    ys = normed_ys * h

    # initialize ground truth values
    mask = np.zeros(score_map_shape, dtype = np.int32)
    pixel_cls_label = np.ones(score_map_shape, dtype = np.int32) * background_label
    pixel_cls_weight = np.zeros(score_map_shape, dtype = np.float32)

    pixel_link_label = np.zeros((h, w, num_neighbours), dtype = np.int32)
    pixel_link_weight = np.ones((h, w, num_neighbours), dtype = np.float32)

    # find overlapped pixels, and consider them as ignored in pixel_cls_weight
    # and pixels in ignored bboxes are ignored as well
    # That is to say, only the weights of not ignored pixels are set to 1

    ## get the masks of all bboxes
    bbox_masks = []
    pos_mask = mask.copy()
    for bbox_idx, (bbox_xs, bbox_ys) in enumerate(zip(xs, ys)):
        if labels[bbox_idx] == background_label:
            continue

        bbox_mask = mask.copy()

        bbox_points = zip(bbox_xs, bbox_ys)
        bbox_contours = util.img.points_to_contours(bbox_points)
        util.img.draw_contours(bbox_mask, bbox_contours, idx = -1, 
                               color = 1, border_width = -1)

        bbox_masks.append(bbox_mask)

        if labels[bbox_idx] == text_label:
            pos_mask += bbox_mask

    # treat overlapped in-bbox pixels as negative, 
    # and non-overlapped  ones as positive
    pos_mask = np.asarray(pos_mask == 1, dtype = np.int32)
    num_positive_pixels = np.sum(pos_mask)

    ## add all bbox_maskes, find non-overlapping pixels
    sum_mask = np.sum(bbox_masks, axis = 0)
    not_overlapped_mask = sum_mask == 1


    ## gt and weight calculation
    for bbox_idx, bbox_mask in enumerate(bbox_masks):
        bbox_label = labels[bbox_idx]
        if bbox_label == ignore_label:
            # for ignored bboxes, only non-overlapped pixels are encoded as ignored 
            bbox_ignore_pixel_mask = bbox_mask * not_overlapped_mask
            pixel_cls_label += bbox_ignore_pixel_mask * ignore_label
            continue

        if labels[bbox_idx] == background_label:
            continue
        # from here on, only text boxes left.

        # for positive bboxes, all pixels within it and pos_mask are positive
        bbox_positive_pixel_mask = bbox_mask * pos_mask
        # background or text is encoded into cls gt
        pixel_cls_label += bbox_positive_pixel_mask * bbox_label

        # for the pixel cls weights, only positive pixels are set to ones
        if pixel_cls_weight_method == PIXEL_CLS_WEIGHT_all_ones:
            pixel_cls_weight += bbox_positive_pixel_mask
        elif pixel_cls_weight_method == PIXEL_CLS_WEIGHT_bbox_balanced:
            # let N denote num_positive_pixels
            # weight per pixel = N /num_positive_bboxes / n_pixels_in_bbox
            # so all pixel weights in this bbox sum to N/num_positive_bboxes
            # and all pixels weights in this image sum to N, the same
            # as setting all weights to 1
            num_bbox_pixels = np.sum(bbox_positive_pixel_mask)
            if num_bbox_pixels > 0: 
                per_bbox_weight = num_positive_pixels * 1.0 / num_positive_bboxes
                per_pixel_weight = per_bbox_weight / num_bbox_pixels
                pixel_cls_weight += bbox_positive_pixel_mask * per_pixel_weight
        else:
            raise ValueError, 'pixel_cls_weight_method not supported:%s'\
                        %(pixel_cls_weight_method)


        ## calculate the labels and weights of links
        ### for all pixels in  bboxes, all links are positive at first
        bbox_point_cords = np.where(bbox_positive_pixel_mask)
        pixel_link_label[bbox_point_cords] = 1


        ## the border of bboxes might be distored because of overlapping
        ## so recalculate it, and find the border mask        
        new_bbox_contours = util.img.find_contours(bbox_positive_pixel_mask)
        bbox_border_mask = mask.copy()
        util.img.draw_contours(bbox_border_mask, new_bbox_contours, -1, 
                   color = 1, border_width = bbox_border_width * 2 + 1)
        bbox_border_mask *= bbox_positive_pixel_mask
        bbox_border_cords = np.where(bbox_border_mask)

        ## give more weight to the border pixels if configured
        pixel_cls_weight[bbox_border_cords] *= pixel_cls_border_weight_lambda

        ### change link labels according to their neighbour status
        border_points = zip(*bbox_border_cords)
        def in_bbox(nx, ny):
            return bbox_positive_pixel_mask[ny, nx]

        for y, x in border_points:
            neighbours = get_neighbours(x, y)
            for n_idx, (nx, ny) in enumerate(neighbours):
                if not is_valid_cord(nx, ny, w, h) or not in_bbox(nx, ny):
                    pixel_link_label[y, x, n_idx] = 0

    pixel_cls_weight = np.asarray(pixel_cls_weight, dtype = np.float32)    
    pixel_link_weight *= np.expand_dims(pixel_cls_weight, axis = -1)

#     try:
#         np.testing.assert_almost_equal(np.sum(pixel_cls_weight), num_positive_pixels, decimal = 1)
#     except:
#         print  num_positive_pixels, np.sum(pixel_cls_label), np.sum(pixel_cls_weight)
#         import pdb
#         pdb.set_trace()
    return pixel_cls_label, pixel_cls_weight, pixel_link_label, pixel_link_weight

#=====================Ground Truth Calculation End====================


#============================Decode Begin=============================

def tf_decode_score_map_to_mask_in_batch(pixel_cls_scores, pixel_link_scores):
    masks = tf.py_func(decode_batch, 
                       [pixel_cls_scores, pixel_link_scores], tf.int32)
    b, h, w = pixel_cls_scores.shape.as_list()
    masks.set_shape([b, h, w])
    return masks



def decode_batch(pixel_cls_scores, pixel_link_scores, 
                 pixel_conf_threshold = None, link_conf_threshold = None):
    import config

    if pixel_conf_threshold is None:
        pixel_conf_threshold = config.pixel_conf_threshold

    if link_conf_threshold is None:
        link_conf_threshold = config.link_conf_threshold

    batch_size = pixel_cls_scores.shape[0]
    batch_mask = []
    for image_idx in xrange(batch_size):
        image_pos_pixel_scores = pixel_cls_scores[image_idx, :, :]
        image_pos_link_scores = pixel_link_scores[image_idx, :, :, :]    
        mask = decode_image(
            image_pos_pixel_scores, image_pos_link_scores, 
            pixel_conf_threshold, link_conf_threshold
        )
        batch_mask.append(mask)
    return np.asarray(batch_mask, np.int32)

# @util.dec.print_calling_in_short
# @util.dec.timeit
def decode_image(pixel_scores, link_scores, 
                 pixel_conf_threshold, link_conf_threshold):
    import config
    if config.decode_method == DECODE_METHOD_join:
        mask =  decode_image_by_join(pixel_scores, link_scores, 
                 pixel_conf_threshold, link_conf_threshold)
        return mask
    elif config.decode_method == DECODE_METHOD_border_split:
        return decode_image_by_border(pixel_scores, link_scores, 
                 pixel_conf_threshold, link_conf_threshold)
    else:
        raise ValueError('Unknow decode method:%s'%(config.decode_method))


import pyximport; pyximport.install()    
from pixel_link_decode import decode_image_by_join

def min_area_rect(cnt):
    """
    Args:
        xs: numpy ndarray with shape=(N,4). N is the number of oriented bboxes. 4 contains [x1, x2, x3, x4]
        ys: numpy ndarray with shape=(N,4), [y1, y2, y3, y4]
            Note that [(x1, y1), (x2, y2), (x3, y3), (x4, y4)] can represent an oriented bbox.
    Return:
        the oriented rects sorrounding the box, in the format:[cx, cy, w, h, theta]. 
    """
    rect = cv2.minAreaRect(cnt)
    cx, cy = rect[0]
    w, h = rect[1]
    theta = rect[2]
    box = [cx, cy, w, h, theta]
    return box, w * h

def rect_to_xys(rect, image_shape):
    """Convert rect to xys, i.e., eight points
    The `image_shape` is used to to make sure all points return are valid, i.e., within image area
    """
    h, w = image_shape[0:2]
    def get_valid_x(x):
        if x < 0:
            return 0
        if x >= w:
            return w - 1
        return x

    def get_valid_y(y):
        if y < 0:
            return 0
        if y >= h:
            return h - 1
        return y

    rect = ((rect[0], rect[1]), (rect[2], rect[3]), rect[4])
    points = cv2.cv.BoxPoints(rect)
    points = np.int0(points)
    for i_xy, (x, y) in enumerate(points):
        x = get_valid_x(x)
        y = get_valid_y(y)
        points[i_xy, :] = [x, y]
    points = np.reshape(points, -1)
    return points

# @util.dec.print_calling_in_short
# @util.dec.timeit
def mask_to_bboxes(mask, image_shape =  None, min_area = None, 
                   min_height = None, min_aspect_ratio = None):
    import config
    feed_shape = config.train_image_shape

    if image_shape is None:
        image_shape = feed_shape

    image_h, image_w = image_shape[0:2]

    if min_area is None:
        min_area = config.min_area

    if min_height is None:
        min_height = config.min_height
    bboxes = []
    max_bbox_idx = mask.max()
    mask = util.img.resize(img = mask, size = (image_w, image_h), 
                           interpolation = cv2.INTER_NEAREST)

    for bbox_idx in xrange(1, max_bbox_idx + 1):
        bbox_mask = mask == bbox_idx
#         if bbox_mask.sum() < 10:
#             continue
        cnts = util.img.find_contours(bbox_mask)
        if len(cnts) == 0:
            continue
        cnt = cnts[0]
        rect, rect_area = min_area_rect(cnt)

        w, h = rect[2:-1]
        if min(w, h) < min_height:
            continue

        if rect_area < min_area:
            continue

#         if max(w, h) * 1.0 / min(w, h) < 2:
#             continue
        xys = rect_to_xys(rect, image_shape)
        bboxes.append(xys)

    return bboxes

有什么建议么?

有没有更适合我要解决的问题的方法?

2个回答

试试谷歌的tesseract他们在内部使用它。这里这里都有博客文章

如果你想要这样的东西,你可以尝试使用已经存在的在线应用程序来测试

如果这是您想要的,您可以创建一个:

  1. 首先定位文本的边界

  2. 将其转换为文本

这是相当复杂的,但您仍然可以使用所谓的 OCR(光学字符识别),因此在 github 中搜索它并使用开源项目作为这样的启动