Skip to content

Commit

Permalink
Fix error while handling table with missing edges
Browse files Browse the repository at this point in the history
Signed-off-by: Shawn <[email protected]>
  • Loading branch information
Shawn committed Apr 22, 2021
1 parent bdce628 commit 0c4c7f2
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 5 deletions.
4 changes: 3 additions & 1 deletion pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,12 +357,14 @@ def get_centroids(indices):
yield (vertices[index - 1] + vertices[index]) / 2

def get_cell_by_centroid(centroid_x, centroid_y):
- return next(
+ cells = list(
filter(
lambda c: c[0] < centroid_x < c[2] and c[1] < centroid_y < c[3],
self.cells,
)
)
+ if cells:
+ return cells[0]

x_centroids = list(get_centroids((0, 2)))
y_centroids = list(get_centroids((1, 3)))
Expand Down
Binary file modified tests/pdfs/issue-420-example.pdf
Binary file not shown.
13 changes: 9 additions & 4 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,13 @@ def test_merge_cell(self):
See issue #420
"""
path = os.path.join(HERE, "pdfs/issue-420-example.pdf")
- TABLE_0 = [
+ TABLE_0_0 = [
["Header 1", "Header 2", "Header 3", "Header 4"],
["Merged cell 1", "Merged cell 2", "Cell 3-1", "Cell 4-1"],
["Merged cell 1", "Merged cell 2", "Cell 3-2", "Cell 4-2"],
["Merged cell 1", "Merged cell 2", "Cell 3-2", "Cell 4-3"],
]
- TABLE_1 = [
+ TABLE_0_1 = [
["1-1", "1-1", "1-3", "1-4", "1-5", "1-6"],
["2-1", "2-2", "2-3", "2-4", "1-5", "1-6"],
["3-1", "3-2", "3-3", "3-4", "1-5", "1-6"],
Expand All @@ -118,9 +118,14 @@ def test_merge_cell(self):
["7-1", "7-1", "7-1", "7-1", "1-5", "1-6"],
["7-1", "7-1", "7-1", "7-1", "7-5", "1-6"],
]
+ TABLE_1_0 = [["1", "2"], [None, "4"]]

with pdfplumber.open(path) as pdf:
tables = pdf.pages[0].extract_tables()
assert len(tables) == 2
- assert TABLE_0 == tables[0]
- assert TABLE_1 == tables[1]
+ assert TABLE_0_0 == tables[0]
+ assert TABLE_0_1 == tables[1]
+
+ tables_missing_edges = pdf.pages[1].extract_tables()
+ assert len(tables_missing_edges) == 1
+ assert TABLE_1_0 == tables_missing_edges[0]

0 comments on commit 0c4c7f2

Please sign in to comment.