diff --git a/.gitignore b/.gitignore index d798e754..96dfe98b 100644 --- a/.gitignore +++ b/.gitignore @@ -54,4 +54,3 @@ dmypy.json # OSX Files .DS_Store - diff --git a/HISTORY.md b/HISTORY.md index 04e5abd9..2e672539 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,14 +2,13 @@ ## master -0.11.0 (2023-02-26) ------------------- +## 0.11.0 (2023-02-26) - Replace `PdfFileReader` with `PdfReader` and pin PyPDF to `>=3.0.0`. [#307](https://github.com/camelot-dev/camelot/pull/307) by [Martin Thoma](https://github.com/MartinThoma). + 0.10.1 (2021-07-11) -0.10.1 (2021-07-11) ------------------- +--- - Change extra requirements from `cv` to `base`. You can use `pip install "camelot-py[base]"` to install everything required to run camelot. diff --git a/README.md b/README.md index 6835ecd9..443a6929 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,6 @@ See [comparison with similar libraries and tools](https://github.com/py-pdf/pypd The easiest way to install pypdf_table_extraction is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution. - ```bash conda install -c conda-forge pypdf-table-extraction ``` diff --git a/camelot/backends/poppler_backend.py b/camelot/backends/poppler_backend.py index ac41d831..d3672586 100644 --- a/camelot/backends/poppler_backend.py +++ b/camelot/backends/poppler_backend.py @@ -1,9 +1,11 @@ import os -import sys import shutil import subprocess +import sys + + +path = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"] -path = os.path.dirname(sys.executable) + os.pathsep + os.environ['PATH'] class PopplerBackend: def convert(self, pdf_path, png_path): diff --git a/camelot/cli.py b/camelot/cli.py index 8aad5eb4..8a9f0c92 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -290,13 +290,17 @@ def stream(c, *args, **kwargs): columns = list(kwargs["columns"]) kwargs["columns"] = None if not columns else columns - margins = conf.pop('margins') + margins = conf.pop("margins") if margins is None: layout_kwargs = {} else: - layout_kwargs = {"char_margin": margins[0], "line_margin": margins[1], "word_margin": margins[2]} - + layout_kwargs = { + "char_margin": margins[0], + "line_margin": margins[1], + "word_margin": margins[2], + } + if plot_type is not None: if not _HAS_MPL: raise ImportError("matplotlib is required for plotting.") @@ -307,7 +311,12 @@ def stream(c, *args, **kwargs): raise click.UsageError("Please specify output file format using --format") tables = read_pdf( - filepath, pages=pages, flavor="stream", suppress_stdout=quiet, layout_kwargs=layout_kwargs, **kwargs + filepath, + pages=pages, + flavor="stream", + suppress_stdout=quiet, + layout_kwargs=layout_kwargs, + **kwargs, ) click.echo(f"Found {tables.n} tables") if plot_type is not None: diff --git a/camelot/handlers.py b/camelot/handlers.py index 74ddde7a..12b3b3e1 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -149,7 +149,7 @@ def parse( suppress_stdout=False, parallel=False, layout_kwargs=None, - **kwargs + **kwargs, ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -189,7 +189,8 @@ def parse( jobs = [] for p in self.pages: j = pool.apply_async( - self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs) + self._parse_page, + (p, tempdir, parser, suppress_stdout, layout_kwargs), ) jobs.append(j) @@ -198,14 +199,14 @@ def parse( tables.extend(t) else: for p in self.pages: - t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs) + t = self._parse_page( + p, tempdir, parser, suppress_stdout, layout_kwargs + ) tables.extend(t) return TableList(sorted(tables)) - def _parse_page( - self, page, tempdir, parser, suppress_stdout, layout_kwargs - ): + def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs): """Extracts tables by calling parser.get_tables on a single page PDF. @@ -224,7 +225,7 @@ def _parse_page( ------- tables : camelot.core.TableList List of tables found in PDF. - + """ self._save_page(self.filepath, page, tempdir) page_path = os.path.join(tempdir, f"page-{page}.pdf") diff --git a/camelot/utils.py b/camelot/utils.py index fda56f54..1fe790e6 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -81,7 +81,7 @@ def download_url(url): with tempfile.NamedTemporaryFile("wb", delete=False) as f: headers = { "User-Agent": "Mozilla/5.0", - "Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1" + "Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1", } request = Request(url, None, headers) obj = urlopen(request) @@ -588,10 +588,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): ------- grouped_chars : list List of tuples of the form (idx, text) where idx is the index - of row/column and text is the an lttextline substring. + of row/column and text is the an LTTextLine substring. """ - idx = 0 cut_text = [] bbox = textline.bbox try: @@ -834,7 +833,6 @@ def compute_whitespace(d): """ whitespace = 0 - r_nempty_cells, c_nempty_cells = [], [] for i in d: for j in i: if j.strip() == "": diff --git a/docs/conf.py b/docs/conf.py index a9851dae..5ea87d88 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -188,7 +188,7 @@ html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = { } +# html_sidebars = { } # Additional templates that should be rendered to pages, maps page names to # template names. @@ -267,7 +267,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "pypdf-table-extraction.tex", "pypdf-table-extraction Documentation", "Vinayak Mehta", "manual"), + ( + master_doc, + "pypdf-table-extraction.tex", + "pypdf-table-extraction Documentation", + "Vinayak Mehta", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of @@ -307,7 +313,15 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "pypdf_table_extraction", "pypdf_table_extraction Documentation", [author], 1)] +man_pages = [ + ( + master_doc, + "pypdf_table_extraction", + "pypdf_table_extraction Documentation", + [author], + 1, + ) +] # If true, show URL addresses after external links. # @@ -346,4 +360,3 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. # # texinfo_no_detailmenu = False - diff --git a/docs/requirements.txt b/docs/requirements.txt index 9edd2d04..b4194739 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ myst_parser==4.0.0 ghostscript==0.7 opencv-python==4.10.0.84 matplotlib==3.9.2 -accessible-pygments==0.0.5 +accessible-pygments==0.0.5 pydata-sphinx-theme==0.15.4 sphinx-copybutton==0.5.2 sphinx-prompt==1.9.0 diff --git a/docs/user/cli.rst b/docs/user/cli.rst index fa415971..f25b84bd 100644 --- a/docs/user/cli.rst +++ b/docs/user/cli.rst @@ -5,7 +5,7 @@ Command-Line Interface pypdf_table_extraction comes with a command-line interface. -You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below. +You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below. Furthermore, you can print the help for each command by typing ``camelot --help``. Try it out! .. click:: camelot.cli:cli diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst index 3f4c52d4..13ee8e31 100644 --- a/docs/user/quickstart.rst +++ b/docs/user/quickstart.rst @@ -118,11 +118,11 @@ pypdf_table_extraction supports extracting tables in parrallel using all the ava Here's how you can do the same with the :ref:`command-line interface `. .. code-block:: console - + $ camelot --pages all --parallel lattice foo.pdf .. note:: The reading of the PDF document is parallelized by processing pages by different CPU core. - Therefore, a document with a low page count could be slower to process in parallel. + Therefore, a document with a low page count could be slower to process in parallel. Reading encrypted PDFs ---------------------- diff --git a/noxfile.py b/noxfile.py index 8dcbcb8d..9cd15b72 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,4 +1,5 @@ """Nox sessions.""" + import os import shlex import shutil @@ -168,7 +169,12 @@ def tests(session: Session) -> None: session.install(".") session.install( - "coverage[toml]", "pytest", "pytest-mpl", "pygments", *base_requires, *plot_requires + "coverage[toml]", + "pytest", + "pytest-mpl", + "pygments", + *base_requires, + *plot_requires, ) try: session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs) @@ -222,7 +228,14 @@ def docs_build(session: Session) -> None: session.install(".") session.install( - "sphinx", "sphinx-click", "sphinx-book-theme", "myst-parser", "sphinx-copybutton", "sphinx-prompt", *base_requires, *plot_requires + "sphinx", + "sphinx-click", + "sphinx-book-theme", + "myst-parser", + "sphinx-copybutton", + "sphinx-prompt", + *base_requires, + *plot_requires, ) build_dir = Path("docs", "_build") diff --git a/tests/test_cli.py b/tests/test_cli.py index 2357eae6..b9639465 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -62,13 +62,36 @@ def test_cli_stream(testdir): assert format_error in result.output result = runner.invoke( - cli, [ "--margins", "1.5", "0.5", "0.8", "--format", "csv", "--output", outfile, "stream", infile] + cli, + [ + "--margins", + "1.5", + "0.5", + "0.8", + "--format", + "csv", + "--output", + outfile, + "stream", + infile, + ], ) assert result.exit_code == 0 assert result.output == "Found 1 tables\n" result = runner.invoke( - cli, ["--margins", "1.5", "0.5", "--format", "csv", "--output", outfile, "stream", infile] + cli, + [ + "--margins", + "1.5", + "0.5", + "--format", + "csv", + "--output", + outfile, + "stream", + infile, + ], ) output_error = "Error: Invalid value for '-M' / '--margins': '--format' is not a valid float." assert output_error in result.output @@ -214,6 +237,7 @@ def test_cli_quiet(testdir): ) assert "No tables found on page-1" not in result.output + def test_cli_lattice_plot_type(): with TemporaryDirectory() as tempdir: runner = CliRunner() diff --git a/tests/test_errors.py b/tests/test_errors.py index 41262c13..3f2a0c7d 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -1,10 +1,10 @@ import os import warnings -from camelot.utils import is_url import pytest import camelot +from camelot.utils import is_url from tests.conftest import skip_on_windows @@ -145,7 +145,7 @@ def test_lattice_ghostscript_deprecation_warning(foo_pdf): def test_invalid_url(): - url = 'fttp://google.com/pdf' + url = "fttp://google.com/pdf" message = "File format not supported" with pytest.raises(Exception, match=message): url = camelot.read_pdf(url) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9a68f386..dda1b866 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,11 +2,10 @@ import os from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import ( - LAParams, - LTTextBoxHorizontal -) -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager +from pdfminer.layout import LAParams +from pdfminer.layout import LTTextBoxHorizontal +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfpage import PDFPage from camelot.utils import bbox_intersection_area @@ -16,7 +15,7 @@ def get_text_from_pdf(filename): "Method to extract text object from pdf" # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis - document = open(filename, 'rb') + document = open(filename, "rb") # Create resource manager rsrcmgr = PDFResourceManager() # Set parameters for analysis.