Merge pull request #244 from UAlbanyArchives/develop

MSG encoding and WARC page requisites fixes

gwiedeman authored Feb 7, 2024
2 parents 70bf95a + 808a32e commit c650444

Showing 6 changed files with 123 additions and 43 deletions.
2 changes: 1 addition & 1 deletion mailbagit/__init__.py
@@ -1,7 +1,7 @@
# __init__.py

# Version of the mailbagit package
- __version__ = "0.7.0"
+ __version__ = "0.7.1"

import os
from pathlib import Path
135 changes: 98 additions & 37 deletions mailbagit/derivatives/warc.py
@@ -61,15 +61,41 @@ def email_external_resources(self, soup):

return external_urls

- def html_external_resources(self, soup):
+ def validate_url(self, url, errors):
+ """
+ Checks if a URL is valid and has an http/https scheme before it is requested, and raises a warning if it is invalid or uses a different scheme.
+ Parameters:
+ url (str): A URL found within an email or an external HTML page.
+ errors (List): List of Error objects defined in models.py
+ Returns:
+ bool: True if the URL is valid and uses an http/https scheme, False otherwise.
+ """
+ try:
+ result = urllib.parse.urlparse(url)
+ check = all([result.scheme, result.netloc])
+ if check and result.scheme.lower().strip().startswith("http"):
+ return True
+ else:
+ desc = f"When writing WARC derivative, skipping URL with non-http/https scheme: {url}"
+ errors = common.handle_error(errors, None, desc, "warn")
+ return False
+ except Exception as e:
+ desc = f"When writing WARC derivative, skipping invalid URL: {url}"
+ errors = common.handle_error(errors, None, desc, "warn")
+ return False
+
+ def html_external_resources(self, soup, url):
"""
Reads an HTML body string and looks for all externally-hosted resources
Parameters:
soup(obj): A BeautifulSoup object
+ url (str): A string of the URL from which the page was requested (used to resolve relative links)
Returns:
- List: A list of URLs
+ List: A deduplicated list of URLs
"""
external_urls = []
# not sure if this is comprehensive but something like "for tag in soup.find_all()"
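A minimal standalone sketch of the scheme check the new validate_url performs, using only the standard library (the example URLs are hypothetical):

    import urllib.parse

    candidates = [
        "https://example.com/style.css",  # kept: valid URL with an http/https scheme
        "ftp://example.com/logo.gif",     # skipped: valid URL, but not http/https
        "cid:image001.jpg@example",       # skipped: no network location
    ]
    for url in candidates:
        result = urllib.parse.urlparse(url)
        ok = all([result.scheme, result.netloc]) and result.scheme.lower().strip().startswith("http")
        print(url, "->", "crawl" if ok else "skip")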
@@ -92,8 +118,11 @@ def html_external_resources(self, soup):
for tag in soup.findAll(tag):
if tag.get(attr) and tag.get(attr).lower().strip().startswith("http"):
external_urls.append(tag.get(attr))
+ else:
+ full_url = urllib.parse.urljoin(url, tag.get(attr))
+ external_urls.append(full_url)

- return external_urls
+ return list(dict.fromkeys(external_urls))

def css_external_resources(self, cssText, cssURL):
"""
@@ -104,7 +133,7 @@ def css_external_resources(self, cssText, cssURL):
cssText(str): A string of CSS
Returns:
- List: A list of URLs
+ List: A deduplicated list of URLs
"""

external_urls = []
@@ -125,7 +154,51 @@ def css_external_resources(self, cssText, cssURL):
else:
external_urls.append(urllib.parse.urljoin(cssURL, url))

- return external_urls
+ return list(dict.fromkeys(external_urls))
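Both collectors now deduplicate their results with list(dict.fromkeys(...)). A quick illustration of the idiom, assuming Python 3.7+ where dicts preserve insertion order:

    urls = ["https://a.example/x.css", "https://b.example/y.js", "https://a.example/x.css"]
    deduped = list(dict.fromkeys(urls))
    # dict keys are unique and keep insertion order, so this yields:
    # ["https://a.example/x.css", "https://b.example/y.js"]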

+ def crawl_external_urls(self, session, request_headers, warc_writer, urls, errors):
+ """
+ Crawls a list of URLs and adds the responses to a WARC file.
+ Parameters:
+ session (requests.Session): The requests session
+ request_headers (dict): A dict of request headers.
+ warc_writer (WARCWriter): A warcio WARC writer object for writing pages to a WARC
+ urls (list): A list of URLs to crawl and add to a WARC.
+ errors (List): List of Error objects defined in models.py
+ Returns:
+ session (requests.Session): The requests session
+ warc_writer (WARCWriter): A warcio WARC writer object for writing pages to a WARC
+ url_page_requisites (list): A de-duplicated list of page requisites (such as CSS and JS) that also need to be crawled
+ errors (List): List of Error objects defined in models.py
+ """
+ url_page_requisites = []
+ i = 0
+ while i < len(urls):
+ log.debug("capturing " + urls[i])
+ # validate url
+ if self.validate_url(urls[i], errors):
+ with capture_http(warc_writer):
+ # First try with SSL verification. If fails, raise a warning and turn off
+ try:
+ r = session.get(urls[i], headers=request_headers)
+ if r.status_code != 200:
+ desc = f"When writing WARC derivative, HTTP {r.status_code} {r.reason} for external resource {urls[i]}"
+ errors = common.handle_error(errors, None, desc, "warn")
+ if "content-type" in r.headers.keys():
+ if "text/html" in r.headers["content-type"]:
+ # Gotta get these external resources as well
+ new_soup = BeautifulSoup(r.text, "html.parser")
+ new_external_urls = self.html_external_resources(new_soup, r.url)
+ url_page_requisites.extend(new_external_urls)
+ elif r.headers["content-type"] == "text/css":
+ new_external_urls = self.css_external_resources(r.text, r.url)
+ url_page_requisites.extend(new_external_urls)
+ except Exception as e:
+ desc = f"Failed to request external URL for WARC derivatives ({urls[i]})"
+ errors = common.handle_error(errors, e, desc)
+ i += 1
+ return session, warc_writer, list(dict.fromkeys(url_page_requisites)), errors

def do_task_per_account(self):
log.debug(self.account.account_data())
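crawl_external_urls relies on warcio's capture_http context manager: any requests traffic issued inside the context is recorded to the WARC. A minimal standalone sketch of that pattern (note that warcio requires requests to be imported after capture_http):

    from warcio.capture_http import capture_http
    import requests  # must be imported after capture_http for capture to work
    from warcio.warcwriter import WARCWriter

    with open("example.warc.gz", "wb") as output:
        warc_writer = WARCWriter(output, gzip=True)
        with capture_http(warc_writer):
            requests.get("https://example.com/")  # request and response records are written to the WARC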
@@ -187,7 +260,7 @@ def do_task_per_message(self, message):
os.makedirs(out_dir)

with open(filename, "wb") as output:
- writer = WARCWriter(output, gzip=True)
+ warc_writer = WARCWriter(output, gzip=True)
# Write HTML Body
try:
headers_list = [
@@ -198,15 +271,15 @@ def do_task_per_message(self, message):
if message.Date:
headers_list.append(("Last-Modified", message.Date))
http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0")
- record = writer.create_warc_record(
+ record = warc_writer.create_warc_record(
f"{warc_uri}/body.html",
"response",
payload=BytesIO(html_formatted.encode("utf-8")),
length=len(html_formatted.encode("utf-8")),
http_headers=http_headers,
warc_content_type="text/html",
)
- writer.write_record(record)
+ warc_writer.write_record(record)
except Exception as e:
desc = "Error creating WARC response record for HTML body"
errors = common.handle_error(errors, e, desc)
@@ -217,29 +290,17 @@ def do_task_per_message(self, message):
request_headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
- i = 0
- while i < len(external_urls):
- log.debug("capturing " + external_urls[i])
- with capture_http(writer):
- # First try with SSL verification. If fails, raise a warning and turn off
- try:
- r = s.get(external_urls[i], headers=request_headers)
- if r.status_code != 200:
- desc = f"When writing WARC derivative, HTTP {r.status_code} {r.reason} for external resource {external_urls[i]}"
- errors = common.handle_error(errors, None, desc, "warn")
- if "content-type" in r.headers.keys():
- if r.headers["content-type"] == "text/html":
- # Gotta get these external resources as well
- new_soup = BeautifulSoup(r.text, "html.parser")
- new_external_urls = self.html_external_resources(new_soup)
- external_urls.extend(new_external_urls)
- elif r.headers["content-type"] == "text/css":
- new_external_urls = self.css_external_resources(r.text, r.url)
- external_urls.extend(new_external_urls)
- except Exception as e:
- desc = f"Failed to request external URL for WARC derivatives ({external_urls[i]})"
- errors = common.handle_error(errors, e, desc)
- i += 1
+
+ # Crawl external URLs
+ s, warc_writer, page_requisites, errors = self.crawl_external_urls(
+ s, request_headers, warc_writer, external_urls, errors
+ )
+
+ # Crawl external URL page requisites
+ s, warc_writer, new_page_requisites, errors = self.crawl_external_urls(
+ s, request_headers, warc_writer, page_requisites, errors
+ )

except Exception as e:
desc = "Error capturing external URL in WARC derivative"
errors = common.handle_error(errors, e, desc)
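The in-place loop above is replaced by two explicit passes: one over the resources referenced in the email body, and one over the page requisites (CSS and JS) those resources pull in; requisites discovered during the second pass are not fetched. A sketch of how the same calls could generalize to a configurable depth, assuming the method signature above and the surrounding do_task_per_message context:

    # hypothetical: crawl to a fixed depth instead of exactly two passes
    queue = external_urls
    for _ in range(2):  # a depth of 2 reproduces the merged behavior
        s, warc_writer, queue, errors = self.crawl_external_urls(
            s, request_headers, warc_writer, queue, errors
        )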
@@ -255,15 +316,15 @@ def do_task_per_message(self, message):
("Date", datetime_to_http_date(datetime.now())),
]
http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0")
- record = writer.create_warc_record(
+ record = warc_writer.create_warc_record(
f"{warc_uri}/{quote_plus(attachment.WrittenName)}",
"response",
payload=BytesIO(attachment.File),
length=len(attachment.File),
http_headers=http_headers,
warc_content_type="text/html",
)
- writer.write_record(record)
+ warc_writer.write_record(record)
except Exception as e:
desc = "Error adding attachments to WARC derivative"
errors = common.handle_error(errors, e, desc)
@@ -276,23 +337,23 @@ def do_task_per_message(self, message):
("Content-Length", str(len(headers_json))),
]
http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0")
- record = writer.create_warc_record(
+ record = warc_writer.create_warc_record(
f"{warc_uri}/headers.json",
"response",
payload=BytesIO(headers_json),
length=len(headers_json),
http_headers=http_headers,
warc_content_type="application/json",
)
- writer.write_record(record)
- record = writer.create_warc_record(
+ warc_writer.write_record(record)
+ record = warc_writer.create_warc_record(
f"{warc_uri}/headers.json",
"metadata",
payload=BytesIO(headers_json),
length=len(headers_json),
warc_content_type="application/json",
)
- writer.write_record(record)
+ warc_writer.write_record(record)
except Exception as e:
desc = "Error creating JSON metadata record to WARC derivative"
errors = common.handle_error(errors, e, desc)
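Record writing throughout this file now goes through the renamed warc_writer. For reference, a minimal self-contained sketch of writing one synthetic response record with warcio (the URI and payload here are hypothetical):

    from io import BytesIO
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders

    payload = b"<html><body>Hello</body></html>"
    with open("body.warc.gz", "wb") as output:
        warc_writer = WARCWriter(output, gzip=True)
        http_headers = StatusAndHeaders(
            "200 OK",
            [("Content-Type", 'text/html; charset="utf-8"'), ("Content-Length", str(len(payload)))],
            protocol="HTTP/1.0",
        )
        record = warc_writer.create_warc_record(
            "mailbag://example-mailbag/body.html",  # hypothetical URI
            "response",
            payload=BytesIO(payload),
            length=len(payload),
            http_headers=http_headers,
            warc_content_type="text/html",
        )
        warc_writer.write_record(record)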
16 changes: 15 additions & 1 deletion mailbagit/formats/msg.py
@@ -92,7 +92,21 @@ def messages(self, iteration_only=False):
html_encoding = None
text_encoding = None
# encoding check priorities
- encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}}
+ encodings = {}
+ """
+ The listed values are apparently unreliable for HTML bodies.
+ Thus, with the encodings dict empty, chardet will be used, which is apparently the least bad option.
+ try:
+ LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE = int("0x3fde", base=16)
+ LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE = int("0x3ffd", base=16)
+ message_body_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE)]
+ message_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE)]
+ encodings[1] = {"name": message_body_codepage, "label": "PidTagInternetCodepage"}
+ encodings[2] = {"name": message_codepage, "label": "PidTagMessageCodepage"}
+ except Exception as e:
+ desc = "Error reading codepages"
+ errors = common.handle_error(errors, e, desc)
+ """
try:
try:
if mail.htmlBody:
7 changes: 6 additions & 1 deletion mailbagit/helper/derivative.py
@@ -214,15 +214,20 @@ def htmlFormatting(message, external_css, headers=True):
# HT to extract_msg for this approach
# https://github.com/TeamMsgExtractor/msg-extractor/blob/6bed8213de1a7a41739fcf5c9363322508711fce/extract_msg/message_base.py#L403-L414
tags = (tag for tag in soup.findAll("img") if tag.get("src") and tag.get("src").startswith("cid:"))
- data = None
for tag in tags:
# Iterate through the attachments until we get the right one.
+ data = None
cid = tag["src"][4:]

for attachment in message.Attachments:
if attachment.Name:
if attachment.Name in cid:
data = attachment.File
+ if data == None:
+ for attachment in message.Attachments:
+ if attachment.Content_ID:
+ if attachment.Content_ID in cid:
+ data = attachment.File

# If we found anything, inject it.
if data:
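A cid taken from an img tag's src typically contains the attachment filename plus a generated suffix, which is why the matching uses a substring test on attachment.Name, with the new fallback on Content_ID when no name matches. A small self-contained sketch with hypothetical values:

    cid = "image001.jpg@01D9ABCD.12345678"  # from src="cid:image001.jpg@01D9ABCD.12345678"
    # hypothetical stand-ins for message.Attachments
    attachments = [{"Name": "image001.jpg", "Content_ID": "image001.jpg@01D9ABCD.12345678", "File": b"..."}]

    data = None
    for attachment in attachments:
        if attachment["Name"] and attachment["Name"] in cid:  # "image001.jpg" is a substring of the cid
            data = attachment["File"]
    if data is None:
        # fallback: match on the Content-ID property itself
        for attachment in attachments:
            if attachment["Content_ID"] and attachment["Content_ID"] in cid:
                data = attachment["File"]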
4 changes: 2 additions & 2 deletions mailbagit/helper/format.py
@@ -67,7 +67,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
try:
valid_encoding = codecs.lookup(encodings[priority]["name"]).name.lower()
valid.append(valid_encoding)
- text = binary_text.decode(valid_encoding)
+ text = binary_text.decode(valid_encoding, errors="strict")
used = encodings[priority]["name"]
success = True
break
@@ -78,7 +78,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
if success == False:
try:
detected = chardet.detect(binary_text)["encoding"]
- text = binary_text.decode(detected)
+ text = binary_text.decode(detected, errors="strict")
used = detected
if len(valid) < 1:
# desc = "No valid listed encodings, but successfully decoded " + body_type + " body with detected encoding " + detected
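With errors="strict", a wrong listed encoding now fails loudly instead of silently mangling characters, so the loop can move on to the next candidate or fall through to chardet detection. A minimal sketch of the pattern (chardet's guess on such a short input is an assumption, though utf-8 is typical):

    import chardet

    binary_text = "Résumé".encode("utf-8")

    text = None
    for name in ["ascii"]:  # hypothetical listed encodings, all wrong here
        try:
            text = binary_text.decode(name, errors="strict")  # raises UnicodeDecodeError on "é"
            break
        except (UnicodeDecodeError, LookupError):
            pass
    if text is None:
        detected = chardet.detect(binary_text)["encoding"]  # likely "utf-8"
        text = binary_text.decode(detected, errors="strict")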
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setuptools.setup(
name="mailbagit",
- version="0.7.0",
+ version="0.7.1",
author="Gregory Wiedeman",
author_email="[email protected]",
description="A tool for preserving email in multiple preservation formats.",
