Tabula UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 4: invalid continuation byte even when not specifying encoding

108 views Asked by At

I'm using the tabula package to try to read this pdf: https://www.conass.org.br/wp-content/uploads/2022/01/RENAME-2022.pdf

import numpy as np
import pandas as pd
import tabula
import os
import jpype #!pip install jpype1

path = 'RENAME-2022.pdf'
tabula.read_pdf(input_path = path, 
                output_format = "dataframe",
                pages = '28-78',
                encoding = 'latin-1')

I tried running this with several different encodings (and with no encoding specified at all), such as 'ISO-8859-1', 'cp1252', 'latin-1' and 'utf-8', but they all return the same error:

UnicodeDecodeError                        Traceback (most recent call last)
Cell In[12], line 3
      1 path = 'RENAME-2022.pdf'
      2 initial_page = 28
----> 3 tabula.read_pdf(input_path = path, 
      4                 output_format = "dataframe",
      5                 pages = '24-78',
      6                 encoding = 'ISO-8859-1')

File ~\anaconda3\Lib\site-packages\tabula\io.py:395, in read_pdf(input_path, output_format, encoding, java_options, pandas_options, multiple_tables, user_agent, use_raw_url, pages, guess, area, relative_area, lattice, stream, password, silent, columns, relative_columns, format, batch, output_path, force_subprocess, options)
    392     raise ValueError(f"{path} is empty. Check the file, or download it manually.")
    394 try:
--> 395     output = _run(
    396         tabula_options,
    397         java_options,
    398         path,
    399         encoding=encoding,
    400         force_subprocess=force_subprocess,
    401     )
    402 finally:
    403     if temporary:

File ~\anaconda3\Lib\site-packages\tabula\io.py:82, in _run(options, java_options, path, encoding, force_subprocess)
     79 elif set(java_options) - IGNORED_JAVA_OPTIONS:
     80     logger.warning("java_options is ignored until rebooting the Python process.")
---> 82 return _tabula_vm.call_tabula_java(options, path)

File ~\anaconda3\Lib\site-packages\tabula\backend.py:117, in SubprocessTabula.call_tabula_java(self, options, path)
    115     if result.stderr:
    116         logger.warning(f"Got stderr: {result.stderr.decode(self.encoding)}")
--> 117     return result.stdout.decode(self.encoding)
    118 except FileNotFoundError:
    119     raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 4: invalid continuation byte

I've searched for this error, but I couldn't find any results related specifically to the tabula package. Most of the suggestions were to use the 'latin-1' encoding, which also didn't work for me. I tried to adapt this solution ('utf-8' codec can't decode byte 0xe2 : invalid continuation byte error) to my case, but it didn't work either:

with open(path, 'rb') as fopen:
        q = fopen.read()
        df = tabula.read_pdf(input_path = q, 
                output_format = "dataframe",
                pages = '24-78',
                encoding = 'ISO-8859-1')

This returns:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[15], line 4
      2 with open(path, 'rb') as fopen:
      3         q = fopen.read()
----> 4         df = tabula.read_pdf(input_path = q, 
      5                 output_format = "dataframe",
      6                 pages = '24-78',
      7                 encoding = 'ISO-8859-1')

File ~\anaconda3\Lib\site-packages\tabula\io.py:389, in read_pdf(input_path, output_format, encoding, java_options, pandas_options, multiple_tables, user_agent, use_raw_url, pages, guess, area, relative_area, lattice, stream, password, silent, columns, relative_columns, format, batch, output_path, force_subprocess, options)
    386 path, temporary = localize_file(input_path, user_agent, use_raw_url=use_raw_url)
    388 if not os.path.exists(path):
--> 389     raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
    391 if os.path.getsize(path) == 0:
    392     raise ValueError(f"{path} is empty. Check the file, or download it manually.")

I would appreciate any help. Thanks in advance.

0

There are 0 answers