diff --git a/doctor/lib/text_extraction.py b/doctor/lib/text_extraction.py
index eea01ee..f3b5810 100644
--- a/doctor/lib/text_extraction.py
+++ b/doctor/lib/text_extraction.py
@@ -43,9 +43,9 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
     :param strip_margin: a flag to crop out the margin of a document and skewed content
     :return: Text from the pdf plumber page
     """
-    if strip_margin:
+    _, _, width, height = page.bbox
+    if strip_margin and (height > width):
         # Crop margins and remove skewed text
-        _, _, width, height = page.bbox
         pixels_per_inch = width / 8.5
         bbox = (
             0,
@@ -57,7 +57,10 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
             page.crop(bbox)
             .filter(is_skewed)
             .extract_text(
-                layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
+                layout=True,
+                keep_blank_chars=True,
+                y_tolerance=5,
+                y_density=25,
             )
         )
     else:
diff --git a/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf b/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf
new file mode 100644
index 0000000..410da34
Binary files /dev/null and b/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf differ
diff --git a/doctor/tests.py b/doctor/tests.py
index f285ec0..ddca107 100644
--- a/doctor/tests.py
+++ b/doctor/tests.py
@@ -74,6 +74,26 @@ def test_recap_extraction_with_strip_margin(self):
             msg="Wrong Text",
         )
 
+    def test_recap_strip_margin_with_multiple_shaped_pdfs(self):
+        """Can we extract an atypically shaped PDF with strip margin?"""
+        files = make_file(
+            filename="recap_extract/gov.uscourts.azd.1085839.3.0.pdf"
+        )
+        params = {"strip_margin": True}
+        response = requests.post(
+            "http://doctor:5050/extract/recap/text/",
+            files=files,
+            params=params,
+        )
+        # Assert status first so a bad response fails with a clear message.
+        self.assertEqual(200, response.status_code, msg="Wrong status code")
+        first_line = response.json()["content"].splitlines()[0].strip()
+        self.assertEqual(
+            "1 WO",
+            first_line,
+            msg="Wrong Text",
+        )
+
     def test_strip_margin_without_ocr(self):
         """Can we extract from the new recap text endpoint with strip margin?"""
         files = make_file(