How to Find Text Coordinates for a Given Character Range in a PDF using PyMuPDF?

101 views Asked by At

I am working on a project where I want to extract text coordinates for a specific character range within a PDF document using PyMuPDF. I have a PDF file and a character range defined by the start and end indices. I want to locate the exact position (coordinates) of the text within this character range on the PDF.

An example: The text of the pdf is:

Erste Verordnung zum Sprengstoffgesetz in der Fassung der Bekanntmachung vom 31. Januar 1991 (BGBl. I S. 169), die zuletzt durch Artikel 1 der Verordnung vom 20. Dezember 2021 (BGBl. I S. 5238) geändert worden ist. Das Sprengstoffgesetz ist anzuwenden.

I have the character range from 21 to 38, which in this case represents the phrase Sprengstoffgesetz. I tried using the search_for function, but it gives me all instances of the term, not just the term in the character range.

Furthermore, the character range might not cover a full word, but only part of one. For example, the range from 32 to 38 represents just the part gesetz.

Is there a way to find the coordinates of the given range?

1

There is 1 answer

0
Mazze On

I may have a solution that involves a few steps. I haven't fully tested it and it may still have some bugs. One problem is that it takes a long time if the PDF is large and the character range is at the end of the PDF.

  • First, I try to find the blocks that contain the character range and combine them into a block rectangle. Since it might be on more than one page, it is split into parts for each page.
  • Second, I use this range as a clip parameter to find the words and iterate through the words to find the rectangles that contain parts of the term.
  • Finally, I combine the rectangles found that are on the same line. So in the end I have single rectangles per page and line that contain the range of characters.

`import fitz

def find_blocks(pdf_file: str, start_char: int, end_char: int):
    """
    Find the blocks containing the character range, grouped by page number.

    The blocks of every page are read with ``page.get_text("blocks")`` and the
    lengths of their text are summed up. Every block from the one in which the
    running length first exceeds ``start_char`` up to (and including) the one
    in which it exceeds ``end_char`` is collected.

        :param pdf_file: Path to the PDF file.
        :param start_char: Start character of the range.
        :param end_char: End character of the range.
        :return: Dict mapping a page number to the list of block boxes (the
            first four entries of each block tuple) collected on that page.
            The dict is empty when ``start_char`` lies beyond the extracted
            text; when the document ends before ``end_char`` is reached, the
            blocks collected so far are returned.
    """
    dict_blocks = {}
    # Only the running length is compared against the range, so the block text
    # itself does not need to be concatenated.
    consumed = 0
    with fitz.open(pdf_file) as doc:
        for page in doc:
            for block in page.get_text("blocks"):
                consumed += len(block[4])
                if consumed > start_char:
                    dict_blocks.setdefault(page.number, []).append(block[:4])
                if consumed > end_char:
                    return dict_blocks
    # The document ended before end_char was reached: still hand back a dict
    # (never None) so callers can iterate over the result unconditionally.
    return dict_blocks

def find_text_coordinates_on_page(pdf_file: str, page_number: int, block_box: fitz.Rect, term: str):
    """
    Find the rectangles of the words that make up ``term`` inside a clip region of a page.

    ``term`` is split on spaces and newlines into pieces. The words returned by
    ``page.get_text("words", clip=block_box)`` are then scanned for the first
    run of consecutive words in which the i-th word contains the i-th piece.
    Matching is by substring, so a piece may cover only part of a word; the
    rectangle reported is still the one of the whole word.

        :param pdf_file: Path to the PDF file.
        :param page_number: Page number of the PDF file.
        :param block_box: The block rectangle containing the character range.
        :param term: The text to find the coordinates for.
        :return: List of ``fitz.Rect``, one per matched word. Empty when the
            term has no non-blank piece or no run of words matches. When the
            words of the clip region run out before every piece was matched
            (the term continues outside the region, e.g. on the next page),
            the rectangles matched up to that point are returned.
    """
    pieces = [piece for piece in term.replace("\n", " ").split(" ") if piece]
    if not pieces:
        return []

    with fitz.open(pdf_file) as doc:
        words = doc[page_number].get_text("words", clip=block_box)

    for start, word in enumerate(words):
        if pieces[0] not in word[4]:
            continue
        # Rectangles are collected per candidate so that a failed or
        # incomplete attempt can never leak into a later one.
        matched = []
        for piece, candidate in zip(pieces, words[start:start + len(pieces)]):
            if piece not in candidate[4]:
                break  # mismatch: drop this candidate and try the next start word
            matched.append(fitz.Rect(candidate[:4]))
        else:
            # No mismatch: either every piece was matched, or the words ran
            # out first. In the latter case no later start word can match the
            # full term either, so the partial run is the best result.
            return matched
    return []


def combine_rects(rects: list):
    """
    Build the smallest rectangle that encloses every rectangle in ``rects``.

        :param rects: List of rectangles to combine; each entry is indexed as
            ``[0]..[3]`` for x0, y0, x1, y1.
        :return: A ``fitz.Rect`` spanning the minimum x0/y0 and maximum x1/y1.
    """
    lefts = [box[0] for box in rects]
    tops = [box[1] for box in rects]
    rights = [box[2] for box in rects]
    bottoms = [box[3] for box in rects]
    return fitz.Rect(min(lefts), min(tops), max(rights), max(bottoms))


def is_same_line(rect1: fitz.Rect, rect2: fitz.Rect, tolerance=5):
    """Tell whether two rectangles start at (nearly) the same height.

        :param rect1: the first rectangle
        :param rect2: the second rectangle
        :param tolerance: the distance between the two ``y0`` values must be
            strictly smaller than this
        :return: True if the rectangles are on the same line, False otherwise"""
    # Only the top edges are compared; heights and x positions are ignored.
    top_gap = rect1.y0 - rect2.y0
    return -tolerance < top_gap < tolerance


def combine_rectangles_on_same_line(rectangles: list):
    """Combine rectangles on the same line.

    The rectangles are ordered by their ``y0`` value and walked once. A
    rectangle joins the currently open group when ``is_same_line`` holds
    between it and the group's most recently added rectangle; otherwise the
    open group is merged with ``combine_rects`` and a new group is started.

        :param rectangles: the rectangles to combine (the list is not modified)
        :return: the combined rectangles, one per group, ordered by increasing ``y0``"""
    result = []  # one merged rectangle per finished group
    current_line = []  # rectangles of the group that is still open
    # sorted() works on a copy, so the caller's list keeps its original order.
    for rect in sorted(rectangles, key=lambda r: r.y0):
        if current_line and not is_same_line(current_line[-1], rect):
            result.append(combine_rects(current_line))
            current_line = []
        current_line.append(rect)

    # Flush the last open group.
    if current_line:
        result.append(combine_rects(current_line))

    return result

def combine_functions(start_char: int, end_char: int, pdf_path: str):
    """
    Combine the functions to find the coordinates for a given character range.

    The text of all pages (``page.get_text()``) is concatenated and sliced with
    ``[start_char:end_char]`` to obtain the term. The blocks covering the range
    are looked up per page, the term is searched inside their bounding box, and
    the word rectangles found are merged line by line.

        :param start_char: Start character of the range.
        :param end_char: End character of the range (exclusive, as in slicing).
        :param pdf_path: Path to the PDF file.
        :return: List with one entry per page on which the term was found; each
            entry is itself the list of merged per-line rectangles for that
            page. Empty when the range selects no text or nothing was found.
    """
    with fitz.open(pdf_path) as doc:
        pdf_text = "".join(page.get_text() for page in doc)
    term = pdf_text[start_char:end_char]
    if not term:
        # Empty or out-of-bounds range: there is nothing to locate.
        return []

    # Find the blocks containing the character range for each page (term might
    # be on multiple pages). Tolerate a missing result instead of crashing.
    dict_blocks = find_blocks(pdf_path, start_char, end_char) or {}

    combined_rects = []
    for page_number, span_boxes in dict_blocks.items():
        # One clip rectangle per page that encloses all relevant blocks.
        combined_block_boxes = combine_rects(span_boxes)
        text_coordinates = find_text_coordinates_on_page(pdf_path, page_number, combined_block_boxes, term)
        if not text_coordinates:
            continue
        # Appended as a list on purpose: the result stays grouped by page.
        combined_rects.append(combine_rectangles_on_same_line(text_coordinates))

    return combined_rects

# Example usage: locate the text pdf_text[52:110] of the document.
path = "path/to/file"  # placeholder, replace with the path to a real PDF
start_char = 52  # index of the first character of the range
end_char = 110  # end of the range (used as the exclusive slice bound)
combined_rects = combine_functions(start_char, end_char, path)`