diff --git a/mailbagit/__init__.py b/mailbagit/__init__.py index 88ab1eb..6153deb 100644 --- a/mailbagit/__init__.py +++ b/mailbagit/__init__.py @@ -1,7 +1,7 @@ # __init__.py # Version of the mailbagit package -__version__ = "0.7.0" +__version__ = "0.7.1" import os from pathlib import Path diff --git a/mailbagit/derivatives/warc.py b/mailbagit/derivatives/warc.py index d8b85ca..2ccf6a5 100644 --- a/mailbagit/derivatives/warc.py +++ b/mailbagit/derivatives/warc.py @@ -61,15 +61,41 @@ def email_external_resources(self, soup): return external_urls - def html_external_resources(self, soup): + def validate_url(self, url, errors): + """ + Checks if a url is valid and has http/https schema before its requested and raises a warning if invalid or has a different schema. + + Parameters: + url(str): A urls found within an email or external html page. + errors (List): List of Error objects defined in models.py + + Returns: + errors (List): List of Error objects defined in models.py + """ + try: + result = urllib.parse.urlparse(url) + check = all([result.scheme, result.netloc]) + if result.scheme.lower().strip().startswith("http"): + return True + else: + desc = f"When writing WARC derivative, skipping URL with non-http/https schema: {url}" + errors = common.handle_error(errors, None, desc, "warn") + return False + except Exception as e: + desc = f"When writing WARC derivative, skipping invalid URL: {url}" + errors = common.handle_error(errors, None, desc, "warn") + return False + + def html_external_resources(self, soup, url): """ Reads an HTML body string and looks for all externally-hosted resources Parameters: soup(obj): A BeautifulSoup object + url(str): A string of the URL from where the object was requested Returns: - List: A list of URLs + List: A deduplicated list of URLs """ external_urls = [] # not sure if this is comprehensive but something like "for tag in soup.find_all()" @@ -92,8 +118,11 @@ def html_external_resources(self, soup): for tag in soup.findAll(tag): if tag.get(attr) and tag.get(attr).lower().strip().startswith("http"): external_urls.append(tag.get(attr)) + else: + full_url = urllib.parse.urljoin(url, tag.get(attr)) + external_urls.append(full_url) - return external_urls + return list(dict.fromkeys(external_urls)) def css_external_resources(self, cssText, cssURL): """ @@ -104,7 +133,7 @@ def css_external_resources(self, cssText, cssURL): cssText(str): A string of CSS Returns: - List: A list of URLs + List: A deduplicated list of URLs """ external_urls = [] @@ -125,7 +154,51 @@ def css_external_resources(self, cssText, cssURL): else: external_urls.append(urllib.parse.urljoin(cssURL, url)) - return external_urls + return list(dict.fromkeys(external_urls)) + + def crawl_external_urls(self, session, request_headers, warc_writer, urls, errors): + """ + Reads a list of urls and crawls them and addes them to a WARC file. + Parameters: + session(str): The requests session + request_headers(dict): A dict of request headers. + warc_writer(WARCWriter): a warcio WARC writer object for writing pages to a WARC + urls(list): A list of urls to crawl and add to a WARC. + errors (List): List of Error objects defined in models.py + + Returns: + session(str): The requests session + warc_writer(WARCWriter): a warcio WARC writer object for writing pages to a WARC + url_page_requisites(list): A de-duplicated list page_requisites like CSS and JS that also need to be crawled + errors (List): List of Error objects defined in models.py + """ + url_page_requisites = [] + i = 0 + while i < len(urls): + log.debug("capturing " + urls[i]) + # validate url + if self.validate_url(urls[i], errors): + with capture_http(warc_writer): + # First try with SSL verification. If fails, raise a warning and turn off + try: + r = session.get(urls[i], headers=request_headers) + if r.status_code != 200: + desc = f"When writing WARC derivative, HTTP {r.status_code} {r.reason} for external resource {urls[i]}" + errors = common.handle_error(errors, None, desc, "warn") + if "content-type" in r.headers.keys(): + if "text/html" in r.headers["content-type"]: + # Gotta get these external resources as well + new_soup = BeautifulSoup(r.text, "html.parser") + new_external_urls = self.html_external_resources(new_soup, r.url) + url_page_requisites.extend(new_external_urls) + elif r.headers["content-type"] == "text/css": + new_external_urls = self.css_external_resources(r.text, r.url) + url_page_requisites.extend(new_external_urls) + except Exception as e: + desc = f"Failed to request external URL for WARC derivatives ({urls[i]})" + errors = common.handle_error(errors, e, desc) + i += 1 + return session, warc_writer, list(dict.fromkeys(url_page_requisites)), errors def do_task_per_account(self): log.debug(self.account.account_data()) @@ -187,7 +260,7 @@ def do_task_per_message(self, message): os.makedirs(out_dir) with open(filename, "wb") as output: - writer = WARCWriter(output, gzip=True) + warc_writer = WARCWriter(output, gzip=True) # Write HTML Body try: headers_list = [ @@ -198,7 +271,7 @@ def do_task_per_message(self, message): if message.Date: headers_list.append(("Last-Modified", message.Date)) http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0") - record = writer.create_warc_record( + record = warc_writer.create_warc_record( f"{warc_uri}/body.html", "response", payload=BytesIO(html_formatted.encode("utf-8")), @@ -206,7 +279,7 @@ def do_task_per_message(self, message): http_headers=http_headers, warc_content_type="text/html", ) - writer.write_record(record) + warc_writer.write_record(record) except Exception as e: desc = "Error creating WARC response record for HTML body" errors = common.handle_error(errors, e, desc) @@ -217,29 +290,17 @@ def do_task_per_message(self, message): request_headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36" } - i = 0 - while i < len(external_urls): - log.debug("capturing " + external_urls[i]) - with capture_http(writer): - # First try with SSL verification. If fails, raise a warning and turn off - try: - r = s.get(external_urls[i], headers=request_headers) - if r.status_code != 200: - desc = f"When writing WARC derivative, HTTP {r.status_code} {r.reason} for external resource {external_urls[i]}" - errors = common.handle_error(errors, None, desc, "warn") - if "content-type" in r.headers.keys(): - if r.headers["content-type"] == "text/html": - # Gotta get these external resources as well - new_soup = BeautifulSoup(r.text, "html.parser") - new_external_urls = self.html_external_resources(new_soup) - external_urls.extend(new_external_urls) - elif r.headers["content-type"] == "text/css": - new_external_urls = self.css_external_resources(r.text, r.url) - external_urls.extend(new_external_urls) - except Exception as e: - desc = f"Failed to request external URL for WARC derivatives ({external_urls[i]})" - errors = common.handle_error(errors, e, desc) - i += 1 + + # Crawl external URLs + s, warc_writer, page_requisites, errors = self.crawl_external_urls( + s, request_headers, warc_writer, external_urls, errors + ) + + # Crawl external URL page requisites + s, warc_writer, new_page_requisites, errors = self.crawl_external_urls( + s, request_headers, warc_writer, page_requisites, errors + ) + except Exception as e: desc = "Error capturing external URL in WARC derivative" errors = common.handle_error(errors, e, desc) @@ -255,7 +316,7 @@ def do_task_per_message(self, message): ("Date", datetime_to_http_date(datetime.now())), ] http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0") - record = writer.create_warc_record( + record = warc_writer.create_warc_record( f"{warc_uri}/{quote_plus(attachment.WrittenName)}", "response", payload=BytesIO(attachment.File), @@ -263,7 +324,7 @@ def do_task_per_message(self, message): http_headers=http_headers, warc_content_type="text/html", ) - writer.write_record(record) + warc_writer.write_record(record) except Exception as e: desc = "Error adding attachments to WARC derivative" errors = common.handle_error(errors, e, desc) @@ -276,7 +337,7 @@ def do_task_per_message(self, message): ("Content-Length", str(len(headers_json))), ] http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0") - record = writer.create_warc_record( + record = warc_writer.create_warc_record( f"{warc_uri}/headers.json", "response", payload=BytesIO(headers_json), @@ -284,15 +345,15 @@ def do_task_per_message(self, message): http_headers=http_headers, warc_content_type="application/json", ) - writer.write_record(record) - record = writer.create_warc_record( + warc_writer.write_record(record) + record = warc_writer.create_warc_record( f"{warc_uri}/headers.json", "metadata", payload=BytesIO(headers_json), length=len(headers_json), warc_content_type="application/json", ) - writer.write_record(record) + warc_writer.write_record(record) except Exception as e: desc = "Error creating JSON metadata record to WARC derivative" errors = common.handle_error(errors, e, desc) diff --git a/mailbagit/formats/msg.py b/mailbagit/formats/msg.py index 5a6f246..c12c7c3 100644 --- a/mailbagit/formats/msg.py +++ b/mailbagit/formats/msg.py @@ -92,7 +92,21 @@ def messages(self, iteration_only=False): html_encoding = None text_encoding = None # encoding check priorities - encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}} + encodings = {} + """ + The listed values are apparently unreliable for HTML bodies. + Thus with the encodings dict empty, chardet will be used, which is apparently the least bad option. + try: + LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE = int("0x3fde", base=16) + LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE = int("0x3ffd", base=16) + message_body_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE)] + message_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE)] + encodings[1] = {"name": message_body_codepage, "label": "PidTagInternetCodepage"} + encodings[2] = {"name": message_codepage, "label": "PidTagMessageCodepage"} + except: + desc = "Error reading codepages" + errors = common.handle_error(errors, e, desc) + """ try: try: if mail.htmlBody: diff --git a/mailbagit/helper/derivative.py b/mailbagit/helper/derivative.py index 46c6d99..61b4761 100644 --- a/mailbagit/helper/derivative.py +++ b/mailbagit/helper/derivative.py @@ -214,15 +214,20 @@ def htmlFormatting(message, external_css, headers=True): # HT to extract_msg for this approach # https://github.com/TeamMsgExtractor/msg-extractor/blob/6bed8213de1a7a41739fcf5c9363322508711fce/extract_msg/message_base.py#L403-L414 tags = (tag for tag in soup.findAll("img") if tag.get("src") and tag.get("src").startswith("cid:")) - data = None for tag in tags: # Iterate through the attachments until we get the right one. + data = None cid = tag["src"][4:] for attachment in message.Attachments: if attachment.Name: if attachment.Name in cid: data = attachment.File + if data == None: + for attachment in message.Attachments: + if attachment.Content_ID: + if attachment.Content_ID in cid: + data = attachment.File # If we found anything, inject it. if data: diff --git a/mailbagit/helper/format.py b/mailbagit/helper/format.py index 45d8022..306da69 100644 --- a/mailbagit/helper/format.py +++ b/mailbagit/helper/format.py @@ -67,7 +67,7 @@ def safely_decode(body_type, binary_text, encodings, errors): try: valid_encoding = codecs.lookup(encodings[priority]["name"]).name.lower() valid.append(valid_encoding) - text = binary_text.decode(valid_encoding) + text = binary_text.decode(valid_encoding, errors="strict") used = encodings[priority]["name"] success = True break @@ -78,7 +78,7 @@ def safely_decode(body_type, binary_text, encodings, errors): if success == False: try: detected = chardet.detect(binary_text)["encoding"] - text = binary_text.decode(detected) + text = binary_text.decode(detected, errors="strict") used = detected if len(valid) < 1: # desc = "No valid listed encodings, but successfully decoded " + body_type + " body with detected encoding " + detected diff --git a/setup.py b/setup.py index bdba6d5..c6ef45c 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="mailbagit", - version="0.7.0", + version="0.7.1", author="Gregory Wiedeman", author_email="gwiedeman@albany.edu", description="A tool for preserving email in multiple preservation formats.",