Skip to content

Commit

Permalink
feat(text-extraction): Remove try except for simple logic
Browse files Browse the repository at this point in the history
  • Loading branch information
flooie committed Jun 14, 2024
1 parent ec91bf8 commit 1045c19
Showing 1 changed file with 11 additions and 18 deletions.
29 changes: 11 additions & 18 deletions doctor/lib/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,33 +43,26 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
:param strip_margin: a flag to crop out the margin of a document and skewed content
:return: Text from the pdf plumber page
"""
if strip_margin:
_, _, width, height = page.bbox
if strip_margin and (height > width):
# Crop margins and remove skewed text
_, _, width, height = page.bbox
pixels_per_inch = width / 8.5
bbox = (
0,
pixels_per_inch * 1, # 1 inch down from top
width, #
pixels_per_inch * 10, # 10 inches from top (1 inch from bottom)
)
try:
page_text = (
page.crop(bbox)
.filter(is_skewed)
.extract_text(
layout=True,
keep_blank_chars=True,
y_tolerance=5,
y_density=25,
)
)
except ValueError:
# If the bounding box is non-standard, we do not want to apply strip margin
page_text = page.extract_text(
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
page_text = (
page.crop(bbox)
.filter(is_skewed)
.extract_text(
layout=True,
keep_blank_chars=True,
y_tolerance=5,
y_density=25,
)

)
else:
page_text = page.extract_text(
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
Expand Down

0 comments on commit 1045c19

Please sign in to comment.