tests(extraction): Add and fix tests

Added and fixed tests Modified one test pdf to better reflect the test
freelawproject · Apr 29, 2024 · 0260927 · 0260927
1 parent a91bc95
commit 0260927
Show file tree

Hide file tree

Showing 8 changed files with 135 additions and 16 deletions.
diff --git a/doctor/test_assets/recap_issues/gov.uscourts.cand.16711.199.0.pdf b/doctor/test_assets/recap_issues/gov.uscourts.cand.16711.199.0.pdf
diff --git a/doctor/test_assets/recap_issues/gov.uscourts.cand.16711.581.0.pdf b/doctor/test_assets/recap_issues/gov.uscourts.cand.16711.581.0.pdf
diff --git a/doctor/test_assets/recap_issues/gov.uscourts.cand.203343.17.0.pdf b/doctor/test_assets/recap_issues/gov.uscourts.cand.203343.17.0.pdf
diff --git a/doctor/test_assets/recap_issues/gov.uscourts.njd.387907.32.0.pdf b/doctor/test_assets/recap_issues/gov.uscourts.njd.387907.32.0.pdf
diff --git a/doctor/test_assets/recap_issues/gov.uscourts.nysd.413741.11.0.pdf b/doctor/test_assets/recap_issues/gov.uscourts.nysd.413741.11.0.pdf
diff --git a/doctor/test_assets/recap_issues/gov.uscourts.nysd.413994.212.0.pdf b/doctor/test_assets/recap_issues/gov.uscourts.nysd.413994.212.0.pdf
diff --git a/doctor/test_assets/vector-pdf.pdf b/doctor/test_assets/vector-pdf.pdf
diff --git a/doctor/tests.py b/doctor/tests.py
@@ -32,37 +32,32 @@ def test_pdf_to_text(self):
         response = requests.post(
             "http://doctor:5050/extract/doc/text/", files=files, data=data
         )
-        text = response.json()["content"][:100].replace("\n", "").strip()
+        text = response.json()["content"].strip()[:200]
         self.assertEqual(200, response.status_code, msg="Wrong status code")
-        self.assertEqual(
-            text,
-            "(Slip Opinion)              OCTOBER TERM, 2012                                       1",
-            msg=text,
-        )
+        self.assertIn("(Slip Opinion)", text, msg="Text not found")
 
     def test_content_extraction(self):
-        """"""
+        """Test if we can extract text from a PDF"""
+
         files = make_file(filename="vector-pdf.pdf")
         data = {"ocr_available": False}
         response = requests.post(
             "http://doctor:5050/extract/doc/text/", files=files, data=data
         )
+        doc_content = response.json()['content']
         self.assertTrue(response.ok, msg="Content extraction failed")
-        self.assertEqual(
-            response.json()["content"][:100].replace("\n", "").strip(),
-            "(Slip Opinion)              OCTOBER TERM, 2012                                       1",
-            msg="Failed to extract content from .pdf file",
-        )
+        self.assertIn("(Slip Opinion)", doc_content[:100], msg="Failed to extract content from .pdf file")
         self.assertFalse(
             response.json()["extracted_by_ocr"],
             msg="Failed to extract by OCR",
         )
         self.assertEqual(
             response.json()["page_count"],
-            30,
+            28,
             msg="Failed to extract by OCR",
         )
 
+
     def test_pdf_ocr_extraction(self):
         files = make_file(filename="image-pdf.pdf")
         params = {"ocr_available": True}
@@ -155,6 +150,131 @@ def test_wpd_format(self):
             msg="Failed to extract content from WPD file",
         )
 
+    def test_recap_document_with_content_in_margin(self):
+        """Can we avoid content in the margin and return no content"""
+        filepath = Path("doctor/test_assets/recap_issues/gov.uscourts.cand.16711.581.0.pdf")
+        response = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": True,
+            },
+        )
+        self.assertEqual(
+            response.json()["err"],
+            "No content",
+            msg=f"Extracted Content for {filepath} but should be blank.",
+        )
+
+    def test_recap_pdf_with_images_and_annotations(self):
+        """Test PDF with images and text annotations"""
+        filepath = Path("doctor/test_assets/recap_issues/gov.uscourts.cand.203343.17.0.pdf")
+        r1 = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": False,
+            },
+        )
+        self.assertEqual(
+            r1.json()["err"],
+            "PDF contains images",
+            msg=f"Extracted Content for {filepath} but should be blank.",
+        )
+
+    def test_pdf_with_missing_fonts(self):
+        """Test PDF with missing fonts"""
+        filepath = Path("doctor/test_assets/recap_issues/gov.uscourts.nysd.413994.212.0.pdf")
+        r1 = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": True,
+            },
+        )
+        self.assertEqual(
+            r1.json()["err"],
+            "PDF missing fonts",
+            msg=f"Extracted Content for {filepath} but should be blank.",
+        )
+
+    def test_margin_excluding_recap_documents(self):
+        """Test strip_margin flag will exclude margin bates stamp"""
+        filepath = Path("doctor/test_assets/recap_issues/gov.uscourts.njd.387907.32.0.pdf")
+        r1 = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": False,
+            },
+        )
+        doc_1 = r1.json()["content"]
+        self.assertIn(
+            "Case 3:18-cv-16281-BRM-TJB",
+            doc_1,
+            msg=f"Bates stamp should be in text {doc_1[:200]}",
+        )
+
+        r2 = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": True,
+            },
+        )
+        doc_2 = r2.json()["content"]
+        self.assertNotIn(
+            "Case 3:18-cv-16281-BRM-TJB",
+            doc_2,
+            msg=f"Bates stamp should not be in text {doc_2[:200]}",
+        )
+
+    def test_recap_contains_image_page(self):
+        """Can we recognize a partial scan partial text as needing OCR"""
+        filepath = Path("doctor/test_assets/recap_issues/gov.uscourts.nysd.413741.11.0.pdf")
+        response = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": True,
+            },
+        ).json()
+        self.assertEqual(
+            response["err"],
+            "PDF contains images",
+            msg=f"Extracted Content for {filepath} but should be blank.",
+        )
+
+    def test_skewed_recap_document(self):
+        """Can we remove sideways text in the margin"""
+        filepath = Path("doctor/test_assets/recap_issues/gov.uscourts.cand.16711.199.0.pdf")
+        response = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": False,
+            },
+        )
+        # The sideways font returns backwards
+        self.assertIn("truoC", response.json()['content'][:50])
+
+        response = requests.post(
+            url="http://doctor:5050/extract/doc/text/",
+            files={"file": (filepath.name, filepath.read_bytes())},
+            params={
+                "ocr_available": False,
+                "strip_margin": True,
+            },
+        )
+        self.assertNotIn("truoC", response.json()['content'][:50])
+
 
 class ThumbnailTests(unittest.TestCase):
     """Can we generate thumbnail images from PDF files"""
@@ -232,7 +352,6 @@ def test_mime_type(self):
             files=files,
             params=params,
         ).json()
-        print(response)
         self.assertEqual(
             response["mimetype"],
             "application/pdf",
@@ -302,7 +421,7 @@ def test_embedding_text_to_image_pdf(self):
         )
         self.assertEqual(
             "",
-            image_response.json()["content"].strip("\x0c\x0c"),
+            image_response.json()["content"].strip(),
             msg="PDF should have no text",
         )
 
@@ -323,7 +442,7 @@ def test_embedding_text_to_image_pdf(self):
                 data=data,
             )
             self.assertIn(
-                "(SlipOpinion)             OCTOBER TERM, 2012",
+                "(SlipOpinion) OCTOBER TERM, 2012",
                 response.json()["content"],
                 msg=f"Got {response.json()}",
             )