Skip to content

Commit

Permalink
tests(extraction): Add and fix tests
Browse files Browse the repository at this point in the history
Added and fixed tests
Modified one test pdf to better reflect the test
  • Loading branch information
flooie committed Apr 29, 2024
1 parent a91bc95 commit 0260927
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 16 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified doctor/test_assets/vector-pdf.pdf
Binary file not shown.
151 changes: 135 additions & 16 deletions doctor/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,37 +32,32 @@ def test_pdf_to_text(self):
response = requests.post(
"http://doctor:5050/extract/doc/text/", files=files, data=data
)
text = response.json()["content"][:100].replace("\n", "").strip()
text = response.json()["content"].strip()[:200]
self.assertEqual(200, response.status_code, msg="Wrong status code")
self.assertEqual(
text,
"(Slip Opinion) OCTOBER TERM, 2012 1",
msg=text,
)
self.assertIn("(Slip Opinion)", text, msg="Text not found")

def test_content_extraction(self):
    """Test if we can extract text from a PDF

    Posts ``vector-pdf.pdf`` to the text extraction endpoint with OCR
    marked as unavailable, then checks that the request succeeded, that
    the expected heading appears near the start of the content, that the
    response does not report OCR extraction, and that the page count
    matches the test asset.
    """
    files = make_file(filename="vector-pdf.pdf")
    data = {"ocr_available": False}
    response = requests.post(
        "http://doctor:5050/extract/doc/text/", files=files, data=data
    )
    # Check the HTTP status before parsing the body so a failed request
    # is reported as such rather than as a JSON decoding error.
    self.assertTrue(response.ok, msg="Content extraction failed")

    payload = response.json()
    doc_content = payload["content"]
    self.assertIn(
        "(Slip Opinion)",
        doc_content[:100],
        msg="Failed to extract content from .pdf file",
    )
    self.assertFalse(
        payload["extracted_by_ocr"],
        msg="Content should not have been extracted by OCR",
    )
    self.assertEqual(
        payload["page_count"],
        28,
        msg="Wrong page count",
    )


def test_pdf_ocr_extraction(self):
files = make_file(filename="image-pdf.pdf")
params = {"ocr_available": True}
Expand Down Expand Up @@ -155,6 +150,131 @@ def test_wpd_format(self):
msg="Failed to extract content from WPD file",
)

def test_recap_document_with_content_in_margin(self):
    """Can we avoid content in the margin and return no content

    Uploads the RECAP PDF with ``strip_margin`` enabled and OCR marked
    unavailable, and expects the response's ``err`` field to be
    "No content".
    """
    filepath = Path(
        "doctor/test_assets/recap_issues/gov.uscourts.cand.16711.581.0.pdf"
    )
    upload = {"file": (filepath.name, filepath.read_bytes())}
    options = {"ocr_available": False, "strip_margin": True}
    payload = requests.post(
        url="http://doctor:5050/extract/doc/text/",
        files=upload,
        params=options,
    ).json()
    self.assertEqual(
        payload["err"],
        "No content",
        msg=f"Extracted Content for {filepath} but should be blank.",
    )

def test_recap_pdf_with_images_and_annotations(self):
    """Test PDF with images and text annotations

    Uploads the RECAP PDF with ``strip_margin`` disabled and OCR marked
    unavailable, and expects the response's ``err`` field to be
    "PDF contains images".
    """
    filepath = Path(
        "doctor/test_assets/recap_issues/gov.uscourts.cand.203343.17.0.pdf"
    )
    upload = {"file": (filepath.name, filepath.read_bytes())}
    options = {"ocr_available": False, "strip_margin": False}
    payload = requests.post(
        url="http://doctor:5050/extract/doc/text/",
        files=upload,
        params=options,
    ).json()
    self.assertEqual(
        payload["err"],
        "PDF contains images",
        msg=f"Extracted Content for {filepath} but should be blank.",
    )

def test_pdf_with_missing_fonts(self):
    """Test PDF with missing fonts

    Uploads the RECAP PDF with ``strip_margin`` enabled and OCR marked
    unavailable, and expects the response's ``err`` field to be
    "PDF missing fonts".
    """
    filepath = Path(
        "doctor/test_assets/recap_issues/gov.uscourts.nysd.413994.212.0.pdf"
    )
    upload = {"file": (filepath.name, filepath.read_bytes())}
    options = {"ocr_available": False, "strip_margin": True}
    payload = requests.post(
        url="http://doctor:5050/extract/doc/text/",
        files=upload,
        params=options,
    ).json()
    self.assertEqual(
        payload["err"],
        "PDF missing fonts",
        msg=f"Extracted Content for {filepath} but should be blank.",
    )

def test_margin_excluding_recap_documents(self):
    """Test strip_margin flag will exclude margin bates stamp

    Extracts the same PDF twice: first with ``strip_margin`` off, where
    the stamp text must be present in the content, then with it on,
    where the stamp text must be absent.
    """
    filepath = Path(
        "doctor/test_assets/recap_issues/gov.uscourts.njd.387907.32.0.pdf"
    )

    def extract_content(strip_margin):
        # Each call re-reads the file and issues its own request.
        reply = requests.post(
            url="http://doctor:5050/extract/doc/text/",
            files={"file": (filepath.name, filepath.read_bytes())},
            params={
                "ocr_available": False,
                "strip_margin": strip_margin,
            },
        )
        return reply.json()["content"]

    doc_1 = extract_content(False)
    self.assertIn(
        "Case 3:18-cv-16281-BRM-TJB",
        doc_1,
        msg=f"Bates stamp should be in text {doc_1[:200]}",
    )

    doc_2 = extract_content(True)
    self.assertNotIn(
        "Case 3:18-cv-16281-BRM-TJB",
        doc_2,
        msg=f"Bates stamp should not be in text {doc_2[:200]}",
    )

def test_recap_contains_image_page(self):
    """Can we recognize a partial scan partial text as needing OCR

    Uploads the RECAP PDF with ``strip_margin`` enabled and OCR marked
    unavailable, and expects the response's ``err`` field to be
    "PDF contains images".
    """
    filepath = Path(
        "doctor/test_assets/recap_issues/gov.uscourts.nysd.413741.11.0.pdf"
    )
    upload = {"file": (filepath.name, filepath.read_bytes())}
    options = {"ocr_available": False, "strip_margin": True}
    http_reply = requests.post(
        url="http://doctor:5050/extract/doc/text/",
        files=upload,
        params=options,
    )
    response = http_reply.json()
    self.assertEqual(
        response["err"],
        "PDF contains images",
        msg=f"Extracted Content for {filepath} but should be blank.",
    )

def test_skewed_recap_document(self):
    """Can we remove sideways text in the margin

    Extracts the same PDF twice and inspects the first 50 characters of
    the content: with ``strip_margin`` off the reversed marker "truoC"
    must appear, and with it on the marker must be gone.
    """
    filepath = Path(
        "doctor/test_assets/recap_issues/gov.uscourts.cand.16711.199.0.pdf"
    )

    def leading_text(strip_margin):
        # Each call re-reads the file and issues its own request.
        reply = requests.post(
            url="http://doctor:5050/extract/doc/text/",
            files={"file": (filepath.name, filepath.read_bytes())},
            params={
                "ocr_available": False,
                "strip_margin": strip_margin,
            },
        )
        return reply.json()["content"][:50]

    # The sideways font returns backwards
    self.assertIn("truoC", leading_text(False))

    self.assertNotIn("truoC", leading_text(True))


class ThumbnailTests(unittest.TestCase):
"""Can we generate thumbnail images from PDF files"""
Expand Down Expand Up @@ -232,7 +352,6 @@ def test_mime_type(self):
files=files,
params=params,
).json()
print(response)
self.assertEqual(
response["mimetype"],
"application/pdf",
Expand Down Expand Up @@ -302,7 +421,7 @@ def test_embedding_text_to_image_pdf(self):
)
self.assertEqual(
"",
image_response.json()["content"].strip("\x0c\x0c"),
image_response.json()["content"].strip(),
msg="PDF should have no text",
)

Expand All @@ -323,7 +442,7 @@ def test_embedding_text_to_image_pdf(self):
data=data,
)
self.assertIn(
"(SlipOpinion) OCTOBER TERM, 2012",
"(SlipOpinion) OCTOBER TERM, 2012",
response.json()["content"],
msg=f"Got {response.json()}",
)
Expand Down

0 comments on commit 0260927

Please sign in to comment.