py-pdf · bosd · Sep 29, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/.gitignore b/.gitignore
@@ -54,4 +54,3 @@ dmypy.json
 
 # OSX Files
 .DS_Store
-
diff --git a/HISTORY.md b/HISTORY.md
@@ -2,14 +2,13 @@
 
 ## master
 
-0.11.0 (2023-02-26)
-------------------
+## 0.11.0 (2023-02-26)
 
 - Replace `PdfFileReader` with `PdfReader` and pin PyPDF to `>=3.0.0`. [#307](https://github.com/camelot-dev/camelot/pull/307) by [Martin Thoma](https://github.com/MartinThoma).
 
+## 0.10.1 (2021-07-11)
 
-0.10.1 (2021-07-11)
-------------------
+---
 
 - Change extra requirements from `cv` to `base`. You can use `pip install "camelot-py[base]"` to install everything required to run camelot.
 

diff --git a/README.md b/README.md
@@ -64,7 +64,6 @@ See [comparison with similar libraries and tools](https://github.com/py-pdf/pypd
 
 The easiest way to install pypdf_table_extraction is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution.
 
-
 ```bash
 conda install -c conda-forge pypdf-table-extraction
 ```

diff --git a/camelot/backends/poppler_backend.py b/camelot/backends/poppler_backend.py
@@ -1,9 +1,11 @@
 import os
-import sys
 import shutil
 import subprocess
+import sys
+
+
+path = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]
 
-path = os.path.dirname(sys.executable) + os.pathsep + os.environ['PATH']
 
 class PopplerBackend:
     def convert(self, pdf_path, png_path):

diff --git a/camelot/cli.py b/camelot/cli.py
@@ -290,13 +290,17 @@ def stream(c, *args, **kwargs):
     columns = list(kwargs["columns"])
     kwargs["columns"] = None if not columns else columns
 
-    margins = conf.pop('margins')
+    margins = conf.pop("margins")
 
     if margins is None:
         layout_kwargs = {}
     else:
-        layout_kwargs = {"char_margin": margins[0], "line_margin": margins[1], "word_margin": margins[2]}
-
+        layout_kwargs = {
+            "char_margin": margins[0],
+            "line_margin": margins[1],
+            "word_margin": margins[2],
+        }
+
     if plot_type is not None:
         if not _HAS_MPL:
             raise ImportError("matplotlib is required for plotting.")
@@ -307,7 +311,12 @@ def stream(c, *args, **kwargs):
             raise click.UsageError("Please specify output file format using --format")
 
     tables = read_pdf(
-        filepath, pages=pages, flavor="stream", suppress_stdout=quiet, layout_kwargs=layout_kwargs, **kwargs
+        filepath,
+        pages=pages,
+        flavor="stream",
+        suppress_stdout=quiet,
+        layout_kwargs=layout_kwargs,
+        **kwargs,
     )
     click.echo(f"Found {tables.n} tables")
     if plot_type is not None:

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -149,7 +149,7 @@ def parse(
         suppress_stdout=False,
         parallel=False,
         layout_kwargs=None,
-        **kwargs
+        **kwargs,
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -189,7 +189,8 @@ def parse(
                     jobs = []
                     for p in self.pages:
                         j = pool.apply_async(
-                            self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                            self._parse_page,
+                            (p, tempdir, parser, suppress_stdout, layout_kwargs),
                         )
                         jobs.append(j)
 
@@ -198,14 +199,14 @@ def parse(
                         tables.extend(t)
             else:
                 for p in self.pages:
-                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                    t = self._parse_page(
+                        p, tempdir, parser, suppress_stdout, layout_kwargs
+                    )
                     tables.extend(t)
 
         return TableList(sorted(tables))
 
-    def _parse_page(
-        self, page, tempdir, parser, suppress_stdout, layout_kwargs
-    ):
+    def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
         """Extracts tables by calling parser.get_tables on a single
         page PDF.
 
@@ -224,7 +225,7 @@ def _parse_page(
         -------
         tables : camelot.core.TableList
             List of tables found in PDF.
-        
+
         """
         self._save_page(self.filepath, page, tempdir)
         page_path = os.path.join(tempdir, f"page-{page}.pdf")

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -81,7 +81,7 @@ def download_url(url):
     with tempfile.NamedTemporaryFile("wb", delete=False) as f:
         headers = {
             "User-Agent": "Mozilla/5.0",
-            "Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1"
+            "Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1",
         }
         request = Request(url, None, headers)
         obj = urlopen(request)
@@ -588,10 +588,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
     -------
     grouped_chars : list
         List of tuples of the form (idx, text) where idx is the index
-        of row/column and text is the an lttextline substring.
+        of row/column and text is an LTTextLine substring.
 
     """
-    idx = 0
     cut_text = []
     bbox = textline.bbox
     try:
@@ -834,7 +833,6 @@ def compute_whitespace(d):
 
     """
     whitespace = 0
-    r_nempty_cells, c_nempty_cells = [], []
     for i in d:
         for j in i:
             if j.strip() == "":

diff --git a/docs/conf.py b/docs/conf.py
@@ -188,7 +188,7 @@
 html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = { }
+# html_sidebars = { }
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
@@ -267,7 +267,13 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, "pypdf-table-extraction.tex", "pypdf-table-extraction Documentation", "Vinayak Mehta", "manual"),
+    (
+        master_doc,
+        "pypdf-table-extraction.tex",
+        "pypdf-table-extraction Documentation",
+        "Vinayak Mehta",
+        "manual",
+    ),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
@@ -307,7 +313,15 @@
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "pypdf_table_extraction", "pypdf_table_extraction Documentation", [author], 1)]
+man_pages = [
+    (
+        master_doc,
+        "pypdf_table_extraction",
+        "pypdf_table_extraction Documentation",
+        [author],
+        1,
+    )
+]
 
 # If true, show URL addresses after external links.
 #
@@ -346,4 +360,3 @@
 # If true, do not generate a @detailmenu in the "Top" node's menu.
 #
 # texinfo_no_detailmenu = False
-
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -5,7 +5,7 @@ myst_parser==4.0.0
 ghostscript==0.7
 opencv-python==4.10.0.84
 matplotlib==3.9.2
-accessible-pygments==0.0.5 
+accessible-pygments==0.0.5
 pydata-sphinx-theme==0.15.4
 sphinx-copybutton==0.5.2
 sphinx-prompt==1.9.0
diff --git a/docs/user/cli.rst b/docs/user/cli.rst
@@ -5,7 +5,7 @@ Command-Line Interface
 
 pypdf_table_extraction comes with a command-line interface.
 
-You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below. 
+You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below.
 Furthermore, you can print the help for each command by typing ``camelot <command> --help``. Try it out!
 
 .. click:: camelot.cli:cli

diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst
@@ -118,11 +118,11 @@ pypdf_table_extraction supports extracting tables in parrallel using all the ava
     Here's how you can do the same with the :ref:`command-line interface <cli>`.
 
     .. code-block:: console
-    
+
         $ camelot --pages all --parallel lattice foo.pdf
 
 .. note:: The reading of the PDF document is parallelized by processing pages by different CPU core.
-    Therefore, a document with a low page count could be slower to process in parallel.  
+    Therefore, a document with a low page count could be slower to process in parallel.
 
 Reading encrypted PDFs
 ----------------------

diff --git a/noxfile.py b/noxfile.py
@@ -1,4 +1,5 @@
 """Nox sessions."""
+
 import os
 import shlex
 import shutil
@@ -168,7 +169,12 @@ def tests(session: Session) -> None:
     session.install(".")
 
     session.install(
-        "coverage[toml]", "pytest", "pytest-mpl", "pygments", *base_requires, *plot_requires
+        "coverage[toml]",
+        "pytest",
+        "pytest-mpl",
+        "pygments",
+        *base_requires,
+        *plot_requires,
     )
     try:
         session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
@@ -222,7 +228,14 @@ def docs_build(session: Session) -> None:
 
     session.install(".")
     session.install(
-        "sphinx", "sphinx-click", "sphinx-book-theme", "myst-parser", "sphinx-copybutton", "sphinx-prompt", *base_requires, *plot_requires
+        "sphinx",
+        "sphinx-click",
+        "sphinx-book-theme",
+        "myst-parser",
+        "sphinx-copybutton",
+        "sphinx-prompt",
+        *base_requires,
+        *plot_requires,
     )
 
     build_dir = Path("docs", "_build")

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -62,13 +62,36 @@ def test_cli_stream(testdir):
         assert format_error in result.output
 
         result = runner.invoke(
-            cli, [ "--margins", "1.5", "0.5", "0.8", "--format", "csv", "--output", outfile, "stream", infile]
+            cli,
+            [
+                "--margins",
+                "1.5",
+                "0.5",
+                "0.8",
+                "--format",
+                "csv",
+                "--output",
+                outfile,
+                "stream",
+                infile,
+            ],
         )
         assert result.exit_code == 0
         assert result.output == "Found 1 tables\n"
 
         result = runner.invoke(
-            cli, ["--margins", "1.5", "0.5", "--format", "csv", "--output", outfile, "stream", infile]
+            cli,
+            [
+                "--margins",
+                "1.5",
+                "0.5",
+                "--format",
+                "csv",
+                "--output",
+                outfile,
+                "stream",
+                infile,
+            ],
         )
         output_error = "Error: Invalid value for '-M' / '--margins': '--format' is not a valid float."
         assert output_error in result.output
@@ -214,6 +237,7 @@ def test_cli_quiet(testdir):
         )
         assert "No tables found on page-1" not in result.output
 
+
 def test_cli_lattice_plot_type():
     with TemporaryDirectory() as tempdir:
         runner = CliRunner()

diff --git a/tests/test_errors.py b/tests/test_errors.py
@@ -1,10 +1,10 @@
 import os
 import warnings
-from camelot.utils import is_url
 
 import pytest
 
 import camelot
+from camelot.utils import is_url
 from tests.conftest import skip_on_windows
 
 
@@ -145,7 +145,7 @@ def test_lattice_ghostscript_deprecation_warning(foo_pdf):
 
 
 def test_invalid_url():
-    url = 'fttp://google.com/pdf'
+    url = "fttp://google.com/pdf"
     message = "File format not supported"
     with pytest.raises(Exception, match=message):
         url = camelot.read_pdf(url)

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -2,11 +2,10 @@
 import os
 
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import (
-    LAParams,
-    LTTextBoxHorizontal
-)
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.layout import LAParams
+from pdfminer.layout import LTTextBoxHorizontal
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 
 from camelot.utils import bbox_intersection_area
@@ -16,7 +15,7 @@ def get_text_from_pdf(filename):
     "Method to extract text object from pdf"
     # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
     # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
-    document = open(filename, 'rb')
+    document = open(filename, "rb")
     # Create resource manager
     rsrcmgr = PDFResourceManager()
     # Set parameters for analysis.