diff --git a/docs/installing_mailbagit/pdf.md b/docs/installing_mailbagit/pdf.md index 9d6569f..d1108ac 100755 --- a/docs/installing_mailbagit/pdf.md +++ b/docs/installing_mailbagit/pdf.md @@ -20,7 +20,7 @@ Installing [wkhtmltopdf](https://wkhtmltopdf.org/) and adding `wkhtmltopdf` or ` ## Chrome Headless -Installing [Google Chrome](https://www.google.com/chrome/) and adding `chrome`, `chrome.exe` or `google-chrome` to your `PATH` will make the `pdf-chrome` derivative option available. If you have Google Chrome installed already, you may just need to add it to your `PATH`. +Installing [Google Chrome](https://www.google.com/chrome/) and adding `chrome`, `chrome.exe`, `google-chrome`, or `Google Chrome` to your `PATH` will make the `pdf-chrome` derivative option available. If you have Google Chrome installed already, you may just need to add it to your `PATH`. For Windows, Chrome usually installs in one of these locations by default: ``` @@ -28,12 +28,15 @@ C:\Program Files\Google\Chrome\Application C:\Program Files (x86)\Google\Chrome\Application ``` +For Macs, the usual install path is `/Applications/Google\ Chrome.app/Contents/MacOS/`. + You can test if Chrome is correctly added to your `PATH` by entering the correlating command into a command line terminal: ``` chrome https://archives.albany.edu/mailbag chrome.exe https://archives.albany.edu/mailbag google-chrome https://archives.albany.edu/mailbag +Google\ Chrome https://archives.albany.edu/mailbag ``` If any of these commands open a Chrome browser window, you're all set! 
diff --git a/mailbagit/__init__.py b/mailbagit/__init__.py index 6153deb..b134e99 100644 --- a/mailbagit/__init__.py +++ b/mailbagit/__init__.py @@ -1,7 +1,7 @@ # __init__.py # Version of the mailbagit package -__version__ = "0.7.1" +__version__ = "0.7.2" import os from pathlib import Path diff --git a/mailbagit/derivatives/pdf_chrome.py b/mailbagit/derivatives/pdf_chrome.py index 18dbfda..abc4a1e 100644 --- a/mailbagit/derivatives/pdf_chrome.py +++ b/mailbagit/derivatives/pdf_chrome.py @@ -10,7 +10,7 @@ skip_registry = False try: - chromes = ["google-chrome", "chrome.exe", "chrome"] + chromes = ["google-chrome", "Google Chrome", "chrome.exe", "chrome"] chrome = next((c for c in chromes if distutils.spawn.find_executable(c)), None) skip_registry = True if chrome is None else False @@ -72,7 +72,7 @@ def do_task_per_message(self, message): "--headless", "--run-all-compositor-stages-before-draw", "--disable-gpu", - "--print-to-pdf-no-header", + "--no-pdf-header-footer", "--print-to-pdf=" + os.path.abspath(pdf_name), os.path.abspath(html_name), ] diff --git a/mailbagit/formats/pst.py b/mailbagit/formats/pst.py index b30b889..23b8ec6 100644 --- a/mailbagit/formats/pst.py +++ b/mailbagit/formats/pst.py @@ -45,6 +45,7 @@ def __init__(self, args, source_parent_dir, mailbag_dir, mailbag_name, **kwargs) self.source_parent_dir = source_parent_dir self.companion_files = args.companion_files log.info("Reading: " + self.path) + self.count = 0 @property def account_data(self): @@ -57,7 +58,7 @@ def number_of_messages(self): count += 1 return count - def folders(self, folder, path, originalFile, errors, iteration_only=False): + def folders(self, folder, path, originalFile, iteration_only=False): # recursive function that calls itself on any subfolders and # returns a generator of messages # path is the email folder path of the message, separated by "/" @@ -69,6 +70,7 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False): yield None continue attachments = 
[] + errors = [] try: messageObj = folder.get_sub_message(index) @@ -124,11 +126,28 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False): rtf_body = rtf_body[:-1] # decode it before using DeEncapsulator rtf_string, html_encoding, errors = format.safely_decode("HTML", rtf_body, encodings, errors) - deencapsulated_body = DeEncapsulator(rtf_string) - deencapsulated_body.deencapsulate() - html_body = deencapsulated_body.html - except: - pass + + # Some sort of encoding issue can cause multiple EOF characters which is malformed RTF + """ + eof_index = rtf_string.find('\x1a') + self.count += 1 + if eof_index != -1: + print (rtf_string.count('\x1a')) + """ + + try: + deencapsulated_body = DeEncapsulator(rtf_body) + deencapsulated_body.deencapsulate() + html_body, html_encoding, errors = format.safely_decode( + "HTML", deencapsulated_body.html, encodings, errors + ) + # html_body = deencapsulated_body.html.decode(html_encoding) + except Exception as e: + desc = "Error parsing RTF body" + errors = common.handle_error(errors, e, desc) + except Exception as e: + desc = "Error parsing HTML or RTF body" + errors = common.handle_error(errors, e, desc) if messageObj.plain_text_body: encodings[len(encodings.keys()) + 1] = { "name": "utf-8", @@ -282,7 +301,7 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False): if folder.number_of_sub_folders: for folder_index in range(folder.number_of_sub_folders): subfolder = folder.get_sub_folder(folder_index) - yield from self.folders(subfolder, path + "/" + subfolder.name, originalFile, errors, iteration_only=iteration_only) + yield from self.folders(subfolder, path + "/" + subfolder.name, originalFile, iteration_only=iteration_only) else: if not iteration_only: if not folder.number_of_sub_messages: @@ -320,11 +339,10 @@ def messages(self, iteration_only=False): pst = pypff.file() pst.open(filePath) root = pst.get_root_folder() - errors = [] for folder in root.sub_folders: if 
folder.number_of_sub_folders: # call recursive function to parse email folder - yield from self.folders(folder, folder.name, originalFile, errors, iteration_only=iteration_only) + yield from self.folders(folder, folder.name, originalFile, iteration_only=iteration_only) else: if not iteration_only: # This is an email folder that does not contain any messages. diff --git a/mailbagit/helper/controller.py b/mailbagit/helper/controller.py index e9a7f34..860c0c8 100644 --- a/mailbagit/helper/controller.py +++ b/mailbagit/helper/controller.py @@ -91,7 +91,7 @@ def writeAttachmentsToDisk(dry_run, attachments_dir, message): if attachment.Name.lower() == "attachments.csv": writtenName = attachment.WrittenName + os.path.splitext(attachment.Name)[1] desc = "" - errors = common.handle_error(errors, None, desc, "warn") + errors = common.handle_error([], None, desc, "warn") else: writtenName = attachment.WrittenName attachment_row = [attachment.Name, writtenName, attachment.MimeType, attachment.Content_ID] @@ -114,7 +114,7 @@ def writeAttachmentsToDisk(dry_run, attachments_dir, message): desc = ( f"Failed to write attachment {attachment.Name} even as normalized name {writtenName}. Instead writing as {random_name}." ) - errors = common.handle_error(errors, None, desc, "error") + errors = common.handle_error([], None, desc, "error") attachment_row = [attachment.Name, random_name, attachment.MimeType, attachment.Content_ID] attachment_path = os.path.join(message_attachments_dir, random_name) f = open(attachment_path, "wb") diff --git a/setup.py b/setup.py index c6ef45c..794fdff 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="mailbagit", - version="0.7.1", + version="0.7.2", author="Gregory Wiedeman", author_email="gwiedeman@albany.edu", description="A tool for preserving email in multiple preservation formats.",