freelawproject · flooie · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/doctor/lib/text_extraction.py b/doctor/lib/text_extraction.py
@@ -43,9 +43,9 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
     :param strip_margin: a flag to crop out the margin of a document and skewed content
     :return: Text from the pdf plumber page
     """
-    if strip_margin:
+    _, _, width, height = page.bbox
+    if strip_margin and (height > width):
         # Crop margins and remove skewed text
-        _, _, width, height = page.bbox
         pixels_per_inch = width / 8.5
         bbox = (
             0,
@@ -57,7 +57,10 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
             page.crop(bbox)
             .filter(is_skewed)
             .extract_text(
-                layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
+                layout=True,
+                keep_blank_chars=True,
+                y_tolerance=5,
+                y_density=25,
             )
         )
     else:

diff --git a/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf b/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf
diff --git a/doctor/tests.py b/doctor/tests.py
@@ -74,6 +74,26 @@ def test_recap_extraction_with_strip_margin(self):
             msg="Wrong Text",
         )
 
+    def test_recap_strip_marign_with_multiple_shaped_pdfs(self):
+        """Can we extract text from an atypically shaped PDF with strip margin?"""
+        files = make_file(
+            filename="recap_extract/gov.uscourts.azd.1085839.3.0.pdf"
+        )
+        params = {"strip_margin": True}
+        response = requests.post(
+            "http://doctor:5050/extract/recap/text/",
+            files=files,
+            params=params,
+        )
+        # Check the status before parsing so a failed request reports as such.
+        self.assertEqual(200, response.status_code, msg="Wrong status code")
+        first_line = response.json()["content"].splitlines()[0].strip()
+        self.assertEqual(
+            "1   WO",
+            first_line,
+            msg="Wrong Text",
+        )
+
     def test_strip_margin_without_ocr(self):
         """Can we extract from the new recap text endpoint with strip margin?"""
         files = make_file(