Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pre commit Fixes #148

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,3 @@ dmypy.json

# OSX Files
.DS_Store

7 changes: 3 additions & 4 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@

## master

0.11.0 (2023-02-26)
------------------
## 0.11.0 (2023-02-26)

- Replace `PdfFileReader` with `PdfReader` and pin PyPDF to `>=3.0.0`. [#307](https://github.com/camelot-dev/camelot/pull/307) by [Martin Thoma](https://github.com/MartinThoma).

0.10.1 (2021-07-11)

0.10.1 (2021-07-11)
------------------
---

- Change extra requirements from `cv` to `base`. You can use `pip install "camelot-py[base]"` to install everything required to run camelot.

Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ See [comparison with similar libraries and tools](https://github.com/py-pdf/pypd

The easiest way to install pypdf_table_extraction is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution.


```bash
conda install -c conda-forge pypdf-table-extraction
```
Expand Down
6 changes: 4 additions & 2 deletions camelot/backends/poppler_backend.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
import sys
import shutil
import subprocess
import sys


path = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]

path = os.path.dirname(sys.executable) + os.pathsep + os.environ['PATH']

class PopplerBackend:
def convert(self, pdf_path, png_path):
Expand Down
17 changes: 13 additions & 4 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,13 +290,17 @@ def stream(c, *args, **kwargs):
columns = list(kwargs["columns"])
kwargs["columns"] = None if not columns else columns

margins = conf.pop('margins')
margins = conf.pop("margins")

if margins is None:
layout_kwargs = {}
else:
layout_kwargs = {"char_margin": margins[0], "line_margin": margins[1], "word_margin": margins[2]}

layout_kwargs = {
"char_margin": margins[0],
"line_margin": margins[1],
"word_margin": margins[2],
}

if plot_type is not None:
if not _HAS_MPL:
raise ImportError("matplotlib is required for plotting.")
Expand All @@ -307,7 +311,12 @@ def stream(c, *args, **kwargs):
raise click.UsageError("Please specify output file format using --format")

tables = read_pdf(
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, layout_kwargs=layout_kwargs, **kwargs
filepath,
pages=pages,
flavor="stream",
suppress_stdout=quiet,
layout_kwargs=layout_kwargs,
**kwargs,
)
click.echo(f"Found {tables.n} tables")
if plot_type is not None:
Expand Down
15 changes: 8 additions & 7 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def parse(
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
**kwargs
**kwargs,
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
Expand Down Expand Up @@ -189,7 +189,8 @@ def parse(
jobs = []
for p in self.pages:
j = pool.apply_async(
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
self._parse_page,
(p, tempdir, parser, suppress_stdout, layout_kwargs),
)
jobs.append(j)

Expand All @@ -198,14 +199,14 @@ def parse(
tables.extend(t)
else:
for p in self.pages:
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
t = self._parse_page(
p, tempdir, parser, suppress_stdout, layout_kwargs
)
tables.extend(t)

return TableList(sorted(tables))

def _parse_page(
self, page, tempdir, parser, suppress_stdout, layout_kwargs
):
def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
"""Extracts tables by calling parser.get_tables on a single
page PDF.

Expand All @@ -224,7 +225,7 @@ def _parse_page(
-------
tables : camelot.core.TableList
List of tables found in PDF.

"""
self._save_page(self.filepath, page, tempdir)
page_path = os.path.join(tempdir, f"page-{page}.pdf")
Expand Down
6 changes: 2 additions & 4 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def download_url(url):
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
headers = {
"User-Agent": "Mozilla/5.0",
"Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1"
"Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1",
}
request = Request(url, None, headers)
obj = urlopen(request)
Expand Down Expand Up @@ -588,10 +588,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
-------
grouped_chars : list
List of tuples of the form (idx, text) where idx is the index
of row/column and text is the an lttextline substring.
of row/column and text is an LTTextLine substring.

"""
idx = 0
cut_text = []
bbox = textline.bbox
try:
Expand Down Expand Up @@ -834,7 +833,6 @@ def compute_whitespace(d):

"""
whitespace = 0
r_nempty_cells, c_nempty_cells = [], []
for i in d:
for j in i:
if j.strip() == "":
Expand Down
21 changes: 17 additions & 4 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@
html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = { }
# html_sidebars = { }

# Additional templates that should be rendered to pages, maps page names to
# template names.
Expand Down Expand Up @@ -267,7 +267,13 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, "pypdf-table-extraction.tex", "pypdf-table-extraction Documentation", "Vinayak Mehta", "manual"),
(
master_doc,
"pypdf-table-extraction.tex",
"pypdf-table-extraction Documentation",
"Vinayak Mehta",
"manual",
),
]

# The name of an image file (relative to this directory) to place at the top of
Expand Down Expand Up @@ -307,7 +313,15 @@

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "pypdf_table_extraction", "pypdf_table_extraction Documentation", [author], 1)]
man_pages = [
(
master_doc,
"pypdf_table_extraction",
"pypdf_table_extraction Documentation",
[author],
1,
)
]

# If true, show URL addresses after external links.
#
Expand Down Expand Up @@ -346,4 +360,3 @@
# If true, do not generate a @detailmenu in the "Top" node's menu.
#
# texinfo_no_detailmenu = False

2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ myst_parser==4.0.0
ghostscript==0.7
opencv-python==4.10.0.84
matplotlib==3.9.2
accessible-pygments==0.0.5
accessible-pygments==0.0.5
pydata-sphinx-theme==0.15.4
sphinx-copybutton==0.5.2
sphinx-prompt==1.9.0
2 changes: 1 addition & 1 deletion docs/user/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Command-Line Interface

pypdf_table_extraction comes with a command-line interface.

You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below.
You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below.
Furthermore, you can print the help for each command by typing ``camelot <command> --help``. Try it out!

.. click:: camelot.cli:cli
Expand Down
4 changes: 2 additions & 2 deletions docs/user/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,11 @@ pypdf_table_extraction supports extracting tables in parallel using all the ava
Here's how you can do the same with the :ref:`command-line interface <cli>`.

.. code-block:: console

$ camelot --pages all --parallel lattice foo.pdf

.. note:: The reading of the PDF document is parallelized by processing pages on different CPU cores.
Therefore, a document with a low page count could be slower to process in parallel.
Therefore, a document with a low page count could be slower to process in parallel.

Reading encrypted PDFs
----------------------
Expand Down
17 changes: 15 additions & 2 deletions noxfile.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Nox sessions."""

import os
import shlex
import shutil
Expand Down Expand Up @@ -168,7 +169,12 @@ def tests(session: Session) -> None:
session.install(".")

session.install(
"coverage[toml]", "pytest", "pytest-mpl", "pygments", *base_requires, *plot_requires
"coverage[toml]",
"pytest",
"pytest-mpl",
"pygments",
*base_requires,
*plot_requires,
)
try:
session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
Expand Down Expand Up @@ -222,7 +228,14 @@ def docs_build(session: Session) -> None:

session.install(".")
session.install(
"sphinx", "sphinx-click", "sphinx-book-theme", "myst-parser", "sphinx-copybutton", "sphinx-prompt", *base_requires, *plot_requires
"sphinx",
"sphinx-click",
"sphinx-book-theme",
"myst-parser",
"sphinx-copybutton",
"sphinx-prompt",
*base_requires,
*plot_requires,
)

build_dir = Path("docs", "_build")
Expand Down
28 changes: 26 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,36 @@ def test_cli_stream(testdir):
assert format_error in result.output

result = runner.invoke(
cli, [ "--margins", "1.5", "0.5", "0.8", "--format", "csv", "--output", outfile, "stream", infile]
cli,
[
"--margins",
"1.5",
"0.5",
"0.8",
"--format",
"csv",
"--output",
outfile,
"stream",
infile,
],
)
assert result.exit_code == 0
assert result.output == "Found 1 tables\n"

result = runner.invoke(
cli, ["--margins", "1.5", "0.5", "--format", "csv", "--output", outfile, "stream", infile]
cli,
[
"--margins",
"1.5",
"0.5",
"--format",
"csv",
"--output",
outfile,
"stream",
infile,
],
)
output_error = "Error: Invalid value for '-M' / '--margins': '--format' is not a valid float."
assert output_error in result.output
Expand Down Expand Up @@ -214,6 +237,7 @@ def test_cli_quiet(testdir):
)
assert "No tables found on page-1" not in result.output


def test_cli_lattice_plot_type():
with TemporaryDirectory() as tempdir:
runner = CliRunner()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_errors.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
import warnings
from camelot.utils import is_url

import pytest

import camelot
from camelot.utils import is_url
from tests.conftest import skip_on_windows


Expand Down Expand Up @@ -145,7 +145,7 @@ def test_lattice_ghostscript_deprecation_warning(foo_pdf):


def test_invalid_url():
url = 'fttp://google.com/pdf'
url = "fttp://google.com/pdf"
message = "File format not supported"
with pytest.raises(Exception, match=message):
url = camelot.read_pdf(url)
Expand Down
11 changes: 5 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
import os

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LAParams,
LTTextBoxHorizontal
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

from camelot.utils import bbox_intersection_area
Expand All @@ -16,7 +15,7 @@ def get_text_from_pdf(filename):
"Method to extract text object from pdf"
# https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
# https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
document = open(filename, 'rb')
document = open(filename, "rb")
# Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
Expand Down