I am new to Python and I am developing a program that takes a PDF file as input and converts it into text. I am using Python 3 and have tried the PyPDF2 and PDFMiner.six packages.
The first PDF file converts well and its text is printed to the console, but when I use another PDF file that contains some empty pages, the text is printed only until an empty page is reached, and then the error shown below appears.
Here is the code:
PyPDF2
import PyPDF2
def extract_txt_pdf(pdf_file: str) -> list[str]:
    """Extract the text of every page of a PDF file with PyPDF2.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One string per page, in page order. A page for which the reader
        yields no text (for example an empty page) contributes an empty
        string, so the result always has one entry per page.
    """
    # PDFs are binary data, so the file is opened in 'rb' mode.
    with open(pdf_file, "rb") as pdf:
        # strict=False is passed through unchanged from the original code;
        # presumably it makes the reader tolerant of malformed files --
        # confirm against the PyPDF2 documentation.
        reader = PyPDF2.PdfReader(pdf, strict=False)
        # Extract inside the `with` block so the underlying file is still
        # open while the pages are being processed. `or ""` guards against
        # a falsy result so callers can always treat entries as str.
        return [page.extract_text() or "" for page in reader.pages]
PDFMiner.six
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
    """Convert a whole PDF file to a single text string with pdfminer.

    Args:
        path: Path of the PDF file to convert.

    Returns:
        The text accumulated from all processed pages as one ``str``.

    Note:
        The returned text can contain characters that the console encoding
        cannot represent; printing it on a cp1252 console is what raised
        ``UnicodeEncodeError`` (character U+25B6) in the reported run.
        Writing the text to a UTF-8 file, or reconfiguring ``sys.stdout``
        to UTF-8, avoids that error.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()  # in-memory sink that the converter writes into
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` guarantees the input file is closed even if parsing fails.
        with open(path, "rb") as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),  # empty page-number set; presumably "all pages" -- confirm
                maxpages=0,  # presumably "no page limit" -- confirm
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed in the `finally` clause.
        return retstr.getvalue()
    finally:
        # Release the converter and the buffer on both success and failure.
        device.close()
        retstr.close()
And here is the error for pdfminer:
print(t)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2032.0_x64__qbz5n2kfra8p0\Lib\encodings\cp1252.py", line 19, in encode return codecs.charmap_encode(input,self.errors,encoding_table)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u25b6' in position 0:
character maps to <undefined>