I have some code that parses through tables in a PDF file and converts them to CSVs.
import csv
import pdfplumber
def process_my_PDF(pdf_path: str, *, csv_path: str = '/tmp/test.csv') -> None:
    """Append every table row extracted from a PDF to a CSV file.

    Each page's tables are obtained with ``page.extract_tables()`` and
    their rows are written, in page order, to ``csv_path``. Every row is
    also echoed to stdout as it is written.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to append to. Defaults to
            ``/tmp/test.csv``, the location that was previously
            hard-coded.

    Returns:
        None. The result is the rows appended to ``csv_path``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Open the output once for the whole document instead of once per
        # page; append mode is kept so existing rows are never truncated.
        with open(csv_path, 'a', newline='') as file:
            # One writer serves every row; it does not need rebuilding
            # inside the loops.
            writer = csv.writer(file)
            for page in pdf.pages:
                for table in page.extract_tables():
                    for row in table:
                        print(row)
                        writer.writerow(row)
# Run the extraction against the sample document.
pdf_path = "/tmp/foo.pdf"
process_my_PDF(pdf_path)
The PDF also has bold, italicized, and underlined text in the table cells, and I need to preserve that formatting in the resulting CSV by wrapping the extracted text in HTML-style markup (I have a reader app that will render it).
How do I detect the formatting in the table text?