Issue with using textract.process to extract text using the pdfminer method

46 views Asked by At

Here is the code from the Jupyter notebook. It takes the PDF files in the data folder and extracts the text from each one using the line text = textract.process(pdf_path, method='pdfminer').

tokenizer = tiktoken.get_encoding("cl100k_base")

# Process each PDF file and prepare for embedding
for pdf_file in pdf_files:    
    pdf_path = os.path.join(data_dir,pdf_file)
    print(pdf_path)
    
    # # Extract the raw text from each PDF using textract
    text = textract.process(pdf_path, method='pdfminer')

I'm getting the following stack trace as a result. It seems like the filename isn't being passed properly at some point during this process. I've spent several hours looking at this and haven't made any progress:

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
File c:\Python311\Lib\site-packages\textract\parsers\pdf_parser.py:54, in Parser.extract_pdfminer(self, filename, **kwargs)
     53 try:
---> 54     stdout, _ = self.run(['pdf2txt.py', filename])
     55 except OSError:

File c:\Python311\Lib\site-packages\textract\parsers\utils.py:87, in ShellParser.run(self, args)
     86 try:
---> 87     pipe = subprocess.Popen(
     88         args,
     89         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
     90     )
     91 except OSError as e:

File c:\Python311\Lib\subprocess.py:1024, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
   1021             self.stderr = io.TextIOWrapper(self.stderr,
   1022                     encoding=encoding, errors=errors)
-> 1024     self._execute_child(args, executable, preexec_fn, close_fds,
   1025                         pass_fds, cwd, env,
   1026                         startupinfo, creationflags, shell,
   1027                         p2cread, p2cwrite,
   1028                         c2pread, c2pwrite,
   1029                         errread, errwrite,
   1030                         restore_signals,
   1031                         gid, gids, uid, umask,
   1032                         start_new_session, process_group)
   1033 except:
   1034     # Cleanup if the child failed starting.

File c:\Python311\Lib\subprocess.py:1493, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
   1492 try:
-> 1493     hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
   1494                              # no special security
   1495                              None, None,
   1496                              int(not close_fds),
   1497                              creationflags,
   1498                              env,
   1499                              cwd,
   1500                              startupinfo)
   1501 finally:
   1502     # Child is launched. Close the parent's copy of those pipe
   1503     # handles that only the child should have open.  You need
   (...)
   1506     # pipe will not close when the child process exits and the
   1507     # ReadFile will hang.

OSError: [WinError 193] %1 is not a valid Win32 application

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
File :12

File c:\Python311\Lib\site-packages\textract\parsers\__init__.py:79, in process(filename, input_encoding, output_encoding, extension, **kwargs)
     76 # do the extraction
     78 parser = filetype_module.Parser()
---> 79 return parser.process(filename, input_encoding, output_encoding, **kwargs)

File c:\Python311\Lib\site-packages\textract\parsers\utils.py:46, in BaseParser.process(self, filename, input_encoding, output_encoding, **kwargs)
     36 """Process ``filename`` and encode byte-string with ``encoding``. This
     37 method is called by :func:`textract.parsers.process` and wraps
     38 the :meth:`.BaseParser.extract` method in `a delicious unicode
     39 sandwich <http://nedbatchelder.com/text/unipain/unipain.html#35>`_.
     40 
     41 """
     42 # make a "unicode sandwich" to handle dealing with unknown
     43 # input byte strings and converting them to a predictable
     44 # output encoding
     45 # http://nedbatchelder.com/text/unipain/unipain.html#35
---> 46 byte_string = self.extract(filename, **kwargs)
     47 unicode_string = self.decode(byte_string, input_encoding)
     48 return self.encode(unicode_string, output_encoding)

File c:\Python311\Lib\site-packages\textract\parsers\pdf_parser.py:32, in Parser.extract(self, filename, method, **kwargs)
     29             raise ex
     31 elif method == 'pdfminer':
---> 32     return self.extract_pdfminer(filename, **kwargs)
     33 elif method == 'tesseract':
     34     return self.extract_tesseract(filename, **kwargs)

File c:\Python311\Lib\site-packages\textract\parsers\pdf_parser.py:57, in Parser.extract_pdfminer(self, filename, **kwargs)
     55 except OSError:
     56     try:
---> 57         stdout, _ = self.run(['python3',pdf2txt_path, filename])
     58     except ShellError:
     59         stdout, _ = self.run(['python2',pdf2txt_path, filename])

File c:\Python311\Lib\site-packages\textract\parsers\utils.py:87, in ShellParser.run(self, args)
     85 # run a subprocess and put the stdout and stderr on the pipe object
     86 try:
---> 87     pipe = subprocess.Popen(
     88         args,
     89         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
     90     )
     91 except OSError as e:
     92     if e.errno == errno.ENOENT:
     93         # File not found.
     94         # This is equivalent to getting exitcode 127 from sh

File c:\Python311\Lib\subprocess.py:1024, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
   1020         if self.text_mode:
   1021             self.stderr = io.TextIOWrapper(self.stderr,
   1022                     encoding=encoding, errors=errors)
-> 1024     self._execute_child(args, executable, preexec_fn, close_fds,
   1025                         pass_fds, cwd, env,
   1026                         startupinfo, creationflags, shell,
   1027                         p2cread, p2cwrite,
   1028                         c2pread, c2pwrite,
   1029                         errread, errwrite,
   1030                         restore_signals,
   1031                         gid, gids, uid, umask,
   1032                         start_new_session, process_group)
   1033 except:
   1034     # Cleanup if the child failed starting.
   1035     for f in filter(None, (self.stdin, self.stdout, self.stderr)):

File c:\Python311\Lib\subprocess.py:1433, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
   1431     args = list2cmdline([args])
   1432 else:
-> 1433     args = list2cmdline(args)
   1435 if executable is not None:
   1436     executable = os.fsdecode(executable)

File c:\Python311\Lib\subprocess.py:608, in list2cmdline(seq)
    606 result = []
    607 needquote = False
--> 608 for arg in map(os.fsdecode, seq):
    609     bs_buf = []
    611     # Add a space to separate this argument from the others

File <frozen os>:824, in fsdecode(filename)

TypeError: expected str, bytes or os.PathLike object, not NoneType

I've mostly tried experimenting with the input to textract.process(pdf_path, method='pdfminer').

The first thing I noticed is that pdf_path had '.\' in front of the path, so I tried the following input instead: textract.process('data\\' + pdf_file, method='pdfminer'). I ended up with the same error.

I also tried passing in just pdf_file, like so: textract.process(pdf_file, method='pdfminer'). That produced the following error:

---------------------------------------------------------------------------
MissingFileError                          Traceback (most recent call last)
File :12

File c:\Python311\Lib\site-packages\textract\parsers\__init__.py:41, in process(filename, input_encoding, output_encoding, extension, **kwargs)
     39 # make sure the filename exists
     40 if not os.path.exists(filename):
---> 41     raise exceptions.MissingFileError(filename)
     43 # get the filename extension, which is something like .docx for
     44 # example, and import the module dynamically using importlib. This
     45 # is a relative import so the name of the package is necessary
     46 # normally, file extension will be extracted from the file name
     47 # if the file name has no extension, then the user can pass the
     48 # extension as an argument
     49 if extension:

MissingFileError: The file "f1040.pdf" can not be found.
Is this the right path/to/file/you/want/to/extract.pdf?

I've also tried adding additional PDF files to the data folder, to check whether the issue had something to do with the fact that I was using only one file. I've also tried changing the PDF file names, to see whether the issue had to do with the name of the file.

0

There are 0 answers