Skip to content

Commit

Permalink
Merge pull request #138 from SuffolkLITLab/change-dependencies
Browse files Browse the repository at this point in the history
Unpin scikit-learn because it conflicts with docassemble
  • Loading branch information
nonprofittechy authored Jul 11, 2024
2 parents c50a8a0 + 2617c72 commit 81356ca
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 41 deletions.
79 changes: 43 additions & 36 deletions formfyxer/docx_wrangling.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@

def add_paragraph_after(paragraph, text):
p = OxmlElement("w:p")
r = OxmlElement('w:r')
t = OxmlElement('w:t')
r = OxmlElement("w:r")
t = OxmlElement("w:t")
t.text = text

r.append(t)
Expand All @@ -33,17 +33,18 @@ def add_paragraph_after(paragraph, text):

def add_paragraph_before(paragraph, text):
    """Insert a new paragraph holding ``text`` directly before ``paragraph``.

    Builds a bare ``w:p`` > ``w:r`` > ``w:t`` element tree and attaches it as
    the previous sibling of the given paragraph's underlying XML element.

    Args:
        paragraph: object exposing the paragraph's XML element as ``_element``.
        text: value assigned to the new ``w:t`` element's text.
    """
    text_el = OxmlElement("w:t")
    text_el.text = text

    run_el = OxmlElement("w:r")
    run_el.append(text_el)

    para_el = OxmlElement("w:p")
    para_el.append(run_el)

    paragraph._element.addprevious(para_el)


def add_run_after(run, text):
r = OxmlElement('w:r')
t = OxmlElement('w:t')
r = OxmlElement("w:r")
t = OxmlElement("w:t")
t.text = text

r.append(t)
Expand All @@ -55,8 +56,8 @@ def update_docx(
) -> docx.document.Document:
"""Update the document with the modified runs.
Note: OpenAI is probabilistic, so the modified run indices may not be correct.
When the index of a run or paragraph is out of range, a new paragraph
Note: OpenAI is probabilistic, so the modified run indices may not be correct.
When the index of a run or paragraph is out of range, a new paragraph
will be inserted at the end of the document or a new run at the end of the
paragraph's runs.
Expand Down Expand Up @@ -88,19 +89,22 @@ def update_docx(
continue
run = paragraph.runs[run_number]
if new_paragraph == 1:
add_paragraph_after(paragraph, modified_text)
add_paragraph_after(paragraph, modified_text)
elif new_paragraph == -1:
add_paragraph_before(paragraph, modified_text)
add_paragraph_before(paragraph, modified_text)
else:
run.text = modified_text
return document

def get_docx_repr(docx_path: str, paragraph_start:int=0, paragraph_end:Optional[int]=None):

def get_docx_repr(
docx_path: str, paragraph_start: int = 0, paragraph_end: Optional[int] = None
):
"""Return a JSON representation of the paragraphs and runs in the DOCX file.
Args:
docx_path: path to the DOCX file
Returns:
A JSON representation of the paragraphs and runs in the DOCX file.
"""
Expand All @@ -117,9 +121,10 @@ def get_docx_repr(docx_path: str, paragraph_start:int=0, paragraph_end:Optional[
)
return repr(items)


def get_labeled_docx_runs(
docx_path: Optional[str] = None,
docx_repr = Optional[str],
docx_repr=Optional[str],
custom_people_names: Optional[Tuple[str, str]] = None,
openai_client: Optional[OpenAI] = None,
api_key: Optional[str] = None,
Expand Down Expand Up @@ -264,22 +269,23 @@ def get_labeled_docx_runs(
"(State the reason for eviction)" transforms into `{{ eviction_reason }}`.
"""
return get_modified_docx_runs(
docx_path = docx_path,
docx_repr = docx_repr,
docx_path=docx_path,
docx_repr=docx_repr,
custom_example=custom_example,
instructions=instructions,
openai_client=openai_client,
api_key=api_key,
)


def get_modified_docx_runs(
docx_path: Optional[str] = None,
docx_repr: Optional[str] = None,
custom_example:str = "",
instructions:str = "",
openai_client: Optional[OpenAI] = None,
api_key:Optional[str]=None,
temperature=0.5,
docx_path: Optional[str] = None,
docx_repr: Optional[str] = None,
custom_example: str = "",
instructions: str = "",
openai_client: Optional[OpenAI] = None,
api_key: Optional[str] = None,
temperature=0.5,
) -> List[Tuple[int, int, str, int]]:
"""Use GPT to rewrite the contents of a DOCX file paragraph by paragraph. Does not handle tables, footers, or
other structures yet.
Expand All @@ -301,9 +307,9 @@ def get_modified_docx_runs(
[1, 0, "I hope this letter finds you well."],
]
Your custom instructions should include an example of how the sample will be modified, like the one below:
Example reply, indicating paragraph, run, the new text, and a number indicating if this changes the
Your custom instructions should include an example of how the sample will be modified, like the one below:
Example reply, indicating paragraph, run, the new text, and a number indicating if this changes the
current paragraph, adds one before, or adds one after (-1, 0, 1):
{"results":
Expand Down Expand Up @@ -336,9 +342,7 @@ def get_modified_docx_runs(
assert isinstance(docx_repr, str)

if not openai_client:
openai_client = OpenAI(
api_key = api_key or os.environ.get("OPENAI_API_KEY")
)
openai_client = OpenAI(api_key=api_key or os.environ.get("OPENAI_API_KEY"))

if not custom_example:
custom_example = """[
Expand All @@ -347,7 +351,9 @@ def get_modified_docx_runs(
[1, 0, "I hope this letter finds you well."],
]"""

if not "[" in instructions: # Make sure we have at least a minimal example of the output
if (
not "[" in instructions
): # Make sure we have at least a minimal example of the output
instructions += """The result will look like this:
{"results":
Expand All @@ -357,7 +363,7 @@ def get_modified_docx_runs(
]
}
"""

role_description = f"""
You will process a DOCX document and return a JSON structure that transforms the DOCX file
based on the following guidelines and examples. The DOCX will be provided as an annotated series of
Expand Down Expand Up @@ -386,11 +392,11 @@ def get_modified_docx_runs(
f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens."
)

moderation_response = openai_client.moderations.create(input=role_description + docx_repr)
moderation_response = openai_client.moderations.create(
input=role_description + docx_repr
)
if moderation_response.results[0].flagged:
raise Exception(
f"OpenAI moderation error: {moderation_response.results[0]}"
)
raise Exception(f"OpenAI moderation error: {moderation_response.results[0]}")

response = openai_client.chat.completions.create(
model="gpt-4-1106-preview",
Expand All @@ -416,6 +422,7 @@ def get_modified_docx_runs(
guesses = json.loads(response.choices[0].message.content)["results"]
return guesses


def make_docx_plain_language(docx_path: str) -> docx.document.Document:
"""
Convert a DOCX file to plain language with the help of OpenAI.
Expand All @@ -439,10 +446,10 @@ def make_docx_plain_language(docx_path: str) -> docx.document.Document:
]
}
""",

)
return update_docx(docx.Document(docx_path), guesses)


def modify_docx_with_openai_guesses(docx_path: str) -> docx.document.Document:
"""Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses.
Expand All @@ -459,4 +466,4 @@ def modify_docx_with_openai_guesses(docx_path: str) -> docx.document.Document:

if __name__ == "__main__":
    # Command-line entry point: run the OpenAI-guess modification on the DOCX
    # given as the first argument and save the result alongside it with an
    # ".output.docx" suffix appended to the original path.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(f"Usage: {sys.argv[0]} <path-to-docx>")
    input_path = sys.argv[1]
    new_doc = modify_docx_with_openai_guesses(input_path)
    new_doc.save(input_path + ".output.docx")
6 changes: 4 additions & 2 deletions formfyxer/pdf_wrangling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,10 +1234,12 @@ def sort_contours(cnts, method: str = "left-to-right"):
# construct tuple of bounding boxes and sort them top to bottom
boundingBoxes = tuple(cv2.boundingRect(c) for c in cnts)
if not boundingBoxes:
return (),()
return (), ()

# Sort the contours and bounding boxes
sorted_zip = sorted(zip(cnts, boundingBoxes), key=lambda b: b[1][coord], reverse=reverse)
sorted_zip = sorted(
zip(cnts, boundingBoxes), key=lambda b: b[1][coord], reverse=reverse
)

if not sorted_zip:
return (), ()
Expand Down
4 changes: 2 additions & 2 deletions formfyxer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ pikepdf
reportlab
requests
ocrmypdf
scikit-learn==1.2.2
scikit-learn
spacy
textstat
transformers
types-requests
types-PyYAML
en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl
typer>=0.4.1,<0.5.0
python-docx
python-docx
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def run(self):
license='MIT',
packages=['formfyxer'],
install_requires=['spacy', 'pdfminer.six', 'pandas', 'pikepdf',
'textstat', 'requests', 'numpy<2.0.0', 'scikit-learn==1.2.2', 'networkx', 'joblib',
'textstat', 'requests', 'numpy', 'scikit-learn', 'networkx', 'joblib',
'nltk', 'boxdetect', 'pdf2image', 'reportlab>=3.6.13', 'pdfminer.six',
'opencv-python', 'ocrmypdf', 'eyecite', 'passivepy>=0.2.16', 'sigfig',
'typer>=0.4.1,<0.5.0', # typer pre 0.4.1 was broken by click 8.1.0: https://github.com/explosion/spaCy/issues/10564
Expand Down

0 comments on commit 81356ca

Please sign in to comment.