Google Vision API structure (text hierarchy) issue

13 views Asked by At

I am using the Google Vision API to extract the text from a cheque image. It extracts all of the data, but not in a structured form; it extracts text using the text-hierarchy technique. The main issue I am having is that when I use the code provided by the Google Vision API documentation, it prints the data in a different, strange structure, whereas when I use the Vision API free trial on the website, it prints the data in a good structure. Secondly, the block, paragraph, and word levels should each produce different output, but the code prints the same structure at every level (block, paragraph, and word). Can anyone help me out? Here is the code that I am using:

from enum import Enum

from google.cloud import vision
from PIL import Image, ImageDraw
import os
##api key


# Granularity levels of the Vision API text hierarchy, from coarsest (PAGE)
# to finest (SYMBOL).  Built with the Enum functional API; members are
# auto-numbered 1..5 in listed order, matching the class-based definition.
FeatureType = Enum("FeatureType", ["PAGE", "BLOCK", "PARA", "WORD", "SYMBOL"])


def draw_boxes(image, bounds, color):
    """Outline each bounding box from *bounds* on *image*.

    Args:
        image: PIL.Image.Image to draw on (modified in place).
        bounds: iterable of Vision API ``BoundingPoly`` objects; each has a
            ``vertices`` sequence of points with ``x``/``y`` attributes.
        color: any outline color understood by Pillow (e.g. ``"blue"``).

    Returns:
        The same image object, for call chaining.
    """
    draw = ImageDraw.Draw(image)
    for bound in bounds:
        # Build the polygon from however many vertices the API returned
        # (normally 4) instead of hard-coding four corners, which raised
        # IndexError on a degenerate box with fewer vertices.  Keyword
        # `outline=` replaces the opaque positional `None, color` pair
        # (fill stays at its default of None, as before).
        draw.polygon([(v.x, v.y) for v in bound.vertices], outline=color)
    return image


def get_document_bounds(image_file, feature):
    """Run document text detection and collect results at one hierarchy level.

    Args:
        image_file: path to the image file on disk.
        feature: ``FeatureType`` selecting which level's bounding boxes and
            text to collect (BLOCK, PARA, WORD, or SYMBOL).

    Returns:
        ``(bounds, recognized_text)``: the bounding boxes of every element at
        the requested level, and the text reassembled at that granularity.

    Raises:
        RuntimeError: if the Vision API response reports an error.
    """
    client = vision.ImageAnnotatorClient()
    bounds = []

    # Distinct local name so the `image_file` path parameter is not
    # shadowed by the open file handle (it was in the original).
    with open(image_file, "rb") as fh:
        content = fh.read()

    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    # Surface API-side failures instead of silently returning empty results.
    if response.error.message:
        raise RuntimeError(response.error.message)
    document = response.full_text_annotation

    recognized_text = ""

    # Walk the page -> block -> paragraph -> word -> symbol hierarchy,
    # collecting bounds and text only at the requested level.
    for page in document.pages:
        for block in page.blocks:
            # Accumulate the *whole* block's text here.  The original read
            # `paragraph_text` after the paragraph loop, so BLOCK output
            # contained only each block's last paragraph — which is why
            # BLOCK, PARA, and WORD all printed the same structure.
            block_text = ""
            for paragraph in block.paragraphs:
                words = []
                for word in paragraph.words:
                    word_text = "".join(s.text for s in word.symbols)
                    if feature == FeatureType.SYMBOL:
                        for symbol in word.symbols:
                            bounds.append(symbol.bounding_box)
                            recognized_text += symbol.text
                    if feature == FeatureType.WORD:
                        # Append the word box once per word; the original
                        # appended it once per symbol, duplicating boxes.
                        bounds.append(word.bounding_box)
                        recognized_text += word_text + " "
                    words.append(word_text)
                # Space-separate words so paragraph/block text is readable.
                paragraph_text = " ".join(words)
                if feature == FeatureType.PARA:
                    bounds.append(paragraph.bounding_box)
                    recognized_text += paragraph_text + " "
                block_text += paragraph_text + " "
            if feature == FeatureType.BLOCK:
                bounds.append(block.bounding_box)
                recognized_text += block_text

    # `bounds` holds the bounding boxes of the requested feature level.
    return bounds, recognized_text


def render_doc_text(filein):
    """Annotate *filein* at block, paragraph, and word granularity.

    Draws blue boxes around blocks, red around paragraphs, and yellow
    around words on a single copy of the image, printing the text
    recovered at each level, then displays the annotated image.
    """
    image = Image.open(filein)
    levels = (
        (FeatureType.BLOCK, "blue", "BLOCKs"),
        (FeatureType.PARA, "red", "PARAs"),
        (FeatureType.WORD, "yellow", "WORDs"),
    )
    for feature, color, label in levels:
        boxes, text = get_document_bounds(filein, feature)
        draw_boxes(image, boxes, color)
        print(f"Extracted Text from {label}:", text)
    image.show()


if __name__ == "__main__":
    # Path to the cheque image to analyze; replace with a real file path.
    input_image_path = r'image path'
    render_doc_text(input_image_path)

I am trying to get the output in a well-structured form.

0

There are 0 answers