Extract data from an unorganized PDF table in Python

43 views Asked by user23531309 At 04 March 2024 at 22:21

I'm trying to write a script that extracts some data from a PDF file ([example](https://drive.google.com/file/d/1kvhLPgdHHvluTtdVUIQ1GTDjiUFQDgSA/view?usp=drive_link)).

The data I need to extract is:

"SERVICIOS AGROPECUARIOS CUYO SA", "HUMBERTO PRIMO N° 251/0 Piso: 10 Depto: 1 CP: (5800) - RIO CUARTO - CORDOBA", "7.564,40", "30/06/2022", "04/2012", "10/2021", "30711982333" and "2001792"

I've tried these two approaches, but neither is working.

import fitz  # PyMuPDF
from docx import Document
import os

# Both names are bound to the process's current working directory.
# NOTE(review): ruta_pdf is the directory itself, not a file inside it —
# confirm this is really what extraer_datos_pdf should be given.
ruta_pdf = current_directory = os.getcwd()

def extraer_datos_pdf(ruta_pdf):
    """Extract the company name, address and document number from a PDF.

    Every text span of every page is inspected. A span starting with
    ``EMPRESA`` or ``DOMICILIO`` supplies the text after its first ``:``;
    a span starting with ``ACTA N`` supplies the text after its first ``°``.
    When several spans match the same label, the last one wins. A label
    span that lacks its separator is ignored instead of raising.

    Args:
        ruta_pdf: Path handed to ``fitz.open``.

    Returns:
        A ``(nombre, domicilio, numero_documento)`` tuple of strings. A
        field whose label is never found keeps its placeholder value.
    """
    nombre = "Nombre no encontrado"
    domicilio = "Domicilio no encontrado"
    numero_documento = "123"

    # The context manager closes the document even if parsing raises.
    with fitz.open(ruta_pdf) as documento:
        for pagina in documento:
            for bloque in pagina.get_text("dict")["blocks"]:
                # Only blocks of type 0 with a bounding box are scanned;
                # their "lines" entry is what gets walked below.
                if bloque["type"] != 0 or bloque["bbox"] is None:
                    continue
                for linea in bloque["lines"]:
                    for span in linea["spans"]:
                        texto = span["text"].strip()
                        if texto.startswith("EMPRESA"):
                            # partition never raises; `sep` is empty when
                            # the label has no ":" in the same span.
                            _, sep, valor = texto.partition(":")
                            if sep:
                                nombre = valor.strip()
                        elif texto.startswith("DOMICILIO"):
                            _, sep, valor = texto.partition(":")
                            if sep:
                                domicilio = valor.strip()
                        elif texto.startswith("ACTA N"):
                            _, sep, valor = texto.partition("°")
                            if sep:
                                numero_documento = valor.strip()

    return nombre, domicilio, numero_documento

import fitz  # PyMuPDF
from docx import Document
import os
import camelot
import tabula

# Every path below is derived from the current working directory.
current_directory = os.getcwd()
directorio = ruta_pdf = current_directory
# "modelo.docx" is looked up directly inside the working directory.
# NOTE(review): ruta_pdf is the directory itself, not a PDF file — confirm.
template_path = os.path.join(directorio, "modelo.docx")

def extraer_datos_pdf(ruta_pdf):
    """Read name, address and document number from the tables of a PDF.

    The tables come from ``camelot.read_pdf`` with ``flavor='stream'``.
    Each cell of each row of every table's ``df`` is checked for the labels
    ``EMPRESA``, ``DOMICILIO`` and ``ACTA N°`` (in that order, first match
    wins for a given cell). The value stored for a matched label is the next
    cell in the same row, or ``"Valor no encontrado"`` when the label sits
    in the last cell. Later matches overwrite earlier ones.

    Args:
        ruta_pdf: Path handed to ``camelot.read_pdf``.

    Returns:
        A ``(nombre, domicilio, numero_documento)`` tuple; a label that is
        never seen keeps its placeholder value.
    """
    tablas = camelot.read_pdf(ruta_pdf, flavor='stream')

    # Keyed by label; insertion order fixes the order labels are tested in.
    hallados = {
        "EMPRESA": "Nombre no encontrado",
        "DOMICILIO": "Domicilio no encontrado",
        "ACTA N°": "123",
    }

    if tablas:
        for tabla in tablas:
            for _, fila in tabla.df.iterrows():
                for posicion, celda in enumerate(fila):
                    etiqueta = next((e for e in hallados if e in celda), None)
                    if etiqueta is None:
                        continue
                    siguiente = posicion + 1
                    if siguiente < len(fila):
                        hallados[etiqueta] = fila[siguiente]
                    else:
                        hallados[etiqueta] = "Valor no encontrado"

    return hallados["EMPRESA"], hallados["DOMICILIO"], hallados["ACTA N°"]

Original Q&A

TechQA.

Extract data from an unorganized PDF table in Python

There are 0 answers

Related Questions in PYTHON

Related Questions in PYMUPDF

Popular Questions

Trending Questions