Skip to content

Commit

Permalink
Merge pull request #252 from UAlbanyArchives/develop
Browse files Browse the repository at this point in the history
Fix PST RTF bodies, controller error handling
  • Loading branch information
gwiedeman authored May 3, 2024
2 parents c650444 + b3e1791 commit 857874d
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 16 deletions.
5 changes: 4 additions & 1 deletion docs/installing_mailbagit/pdf.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,23 @@ Installing [wkhtmltopdf](https://wkhtmltopdf.org/) and adding `wkhtmltopdf` or `

## Chrome Headless

Installing [Google Chrome](https://www.google.com/chrome/) and adding `chrome`, `chrome.exe` or `google-chrome` to your `PATH` will make the `pdf-chrome` derivative option available. If you have Google Chrome installed already, you may just need to add it to your `PATH`.
Installing [Google Chrome](https://www.google.com/chrome/) and adding `chrome`, `chrome.exe`, `google-chrome`, or `Google Chrome` to your `PATH` will make the `pdf-chrome` derivative option available. If you have Google Chrome installed already, you may just need to add it to your `PATH`.

For Windows, Chrome usually installs in one of these locations by default:
```
C:\Program Files\Google\Chrome\Application
C:\Program Files (x86)\Google\Chrome\Application
```

For Macs, the usual install path is `/Applications/Google\ Chrome.app/Contents/MacOS/`.

You can test if Chrome is correctly added to your `PATH` by entering the corresponding command into a command line terminal:

```
chrome https://archives.albany.edu/mailbag
chrome.exe https://archives.albany.edu/mailbag
google-chrome https://archives.albany.edu/mailbag
Google\ Chrome https://archives.albany.edu/mailbag
```

If any of these commands open a Chrome browser window, you're all set!
2 changes: 1 addition & 1 deletion mailbagit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# __init__.py

# Version of the mailbagit package
__version__ = "0.7.1"
__version__ = "0.7.2"

import os
from pathlib import Path
Expand Down
4 changes: 2 additions & 2 deletions mailbagit/derivatives/pdf_chrome.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
skip_registry = False

try:
chromes = ["google-chrome", "chrome.exe", "chrome"]
chromes = ["google-chrome", "Google Chrome", "chrome.exe", "chrome"]
chrome = next((c for c in chromes if distutils.spawn.find_executable(c)), None)
skip_registry = True if chrome is None else False

Expand Down Expand Up @@ -72,7 +72,7 @@ def do_task_per_message(self, message):
"--headless",
"--run-all-compositor-stages-before-draw",
"--disable-gpu",
"--print-to-pdf-no-header",
"--no-pdf-header-footer",
"--print-to-pdf=" + os.path.abspath(pdf_name),
os.path.abspath(html_name),
]
Expand Down
36 changes: 27 additions & 9 deletions mailbagit/formats/pst.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(self, args, source_parent_dir, mailbag_dir, mailbag_name, **kwargs)
self.source_parent_dir = source_parent_dir
self.companion_files = args.companion_files
log.info("Reading: " + self.path)
self.count = 0

@property
def account_data(self):
Expand All @@ -57,7 +58,7 @@ def number_of_messages(self):
count += 1
return count

def folders(self, folder, path, originalFile, errors, iteration_only=False):
def folders(self, folder, path, originalFile, iteration_only=False):
# recursive function that calls itself on any subfolders and
# returns a generator of messages
# path is the email folder path of the message, separated by "/"
Expand All @@ -69,6 +70,7 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False):
yield None
continue
attachments = []
errors = []
try:
messageObj = folder.get_sub_message(index)

Expand Down Expand Up @@ -124,11 +126,28 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False):
rtf_body = rtf_body[:-1]
# decode it before using DeEncapsulator
rtf_string, html_encoding, errors = format.safely_decode("HTML", rtf_body, encodings, errors)
deencapsulated_body = DeEncapsulator(rtf_string)
deencapsulated_body.deencapsulate()
html_body = deencapsulated_body.html
except:
pass

# Some sort of encoding issue can cause multiple EOF characters which is malformed RTF
"""
eof_index = rtf_string.find('\x1a')
self.count += 1
if eof_index != -1:
print (rtf_string.count('\x1a'))
"""

try:
deencapsulated_body = DeEncapsulator(rtf_body)
deencapsulated_body.deencapsulate()
html_body, html_encoding, errors = format.safely_decode(
"HTML", deencapsulated_body.html, encodings, errors
)
# html_body = deencapsulated_body.html.decode(html_encoding)
except Exception as e:
desc = "Error parsing RTF body"
errors = common.handle_error(errors, e, desc)
except Exception as e:
desc = "Error parsing HTML or RTF body"
errors = common.handle_error(errors, e, desc)
if messageObj.plain_text_body:
encodings[len(encodings.keys()) + 1] = {
"name": "utf-8",
Expand Down Expand Up @@ -282,7 +301,7 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False):
if folder.number_of_sub_folders:
for folder_index in range(folder.number_of_sub_folders):
subfolder = folder.get_sub_folder(folder_index)
yield from self.folders(subfolder, path + "/" + subfolder.name, originalFile, errors, iteration_only=iteration_only)
yield from self.folders(subfolder, path + "/" + subfolder.name, originalFile, iteration_only=iteration_only)
else:
if not iteration_only:
if not folder.number_of_sub_messages:
Expand Down Expand Up @@ -320,11 +339,10 @@ def messages(self, iteration_only=False):
pst = pypff.file()
pst.open(filePath)
root = pst.get_root_folder()
errors = []
for folder in root.sub_folders:
if folder.number_of_sub_folders:
# call recursive function to parse email folder
yield from self.folders(folder, folder.name, originalFile, errors, iteration_only=iteration_only)
yield from self.folders(folder, folder.name, originalFile, iteration_only=iteration_only)
else:
if not iteration_only:
# This is an email folder that does not contain any messages.
Expand Down
4 changes: 2 additions & 2 deletions mailbagit/helper/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def writeAttachmentsToDisk(dry_run, attachments_dir, message):
if attachment.Name.lower() == "attachments.csv":
writtenName = attachment.WrittenName + os.path.splitext(attachment.Name)[1]
desc = ""
errors = common.handle_error(errors, None, desc, "warn")
errors = common.handle_error([], None, desc, "warn")
else:
writtenName = attachment.WrittenName
attachment_row = [attachment.Name, writtenName, attachment.MimeType, attachment.Content_ID]
Expand All @@ -114,7 +114,7 @@ def writeAttachmentsToDisk(dry_run, attachments_dir, message):
desc = (
f"Failed to write attachment {attachment.Name} even as normalized name {writtenName}. Instead writing as {random_name}."
)
errors = common.handle_error(errors, None, desc, "error")
errors = common.handle_error([], None, desc, "error")
attachment_row = [attachment.Name, random_name, attachment.MimeType, attachment.Content_ID]
attachment_path = os.path.join(message_attachments_dir, random_name)
f = open(attachment_path, "wb")
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setuptools.setup(
name="mailbagit",
version="0.7.1",
version="0.7.2",
author="Gregory Wiedeman",
author_email="[email protected]",
description="A tool for preserving email in multiple preservation formats.",
Expand Down

0 comments on commit 857874d

Please sign in to comment.