Merge pull request #244 from UAlbanyArchives/develop

MSG encoding and WARC page requisites fixes

gwiedeman authored Feb 7, 2024
2 parents 70bf95a + 808a32e commit c650444

Showing 6 changed files with 123 additions and 43 deletions.
2 changes: 1 addition & 1 deletion mailbagit/__init__.py
@@ -1,7 +1,7 @@
# __init__.py

# Version of the mailbagit package
- __version__ = "0.7.0"
+ __version__ = "0.7.1"

import os
from pathlib import Path
135 changes: 98 additions & 37 deletions mailbagit/derivatives/warc.py
@@ -61,15 +61,41 @@ def email_external_resources(self, soup):

return external_urls

- def html_external_resources(self, soup):
+ def validate_url(self, url, errors):
+ """
+ Checks if a URL is valid and has an http/https scheme before it is requested, and raises a warning if it is invalid or uses a different scheme.
+ Parameters:
+ url (str): A URL found within an email or an external HTML page.
+ errors (List): List of Error objects defined in models.py
+ Returns:
+ bool: True if the URL is valid and uses an http/https scheme, False otherwise.
+ """
+ try:
+ result = urllib.parse.urlparse(url)
+ check = all([result.scheme, result.netloc])
+ if check and result.scheme.lower().strip().startswith("http"):
+ return True
+ else:
+ desc = f"When writing WARC derivative, skipping URL with non-http/https scheme: {url}"
+ errors = common.handle_error(errors, None, desc, "warn")
+ return False
+ except Exception as e:
+ desc = f"When writing WARC derivative, skipping invalid URL: {url}"
+ errors = common.handle_error(errors, None, desc, "warn")
+ return False
+
+ def html_external_resources(self, soup, url):
"""
Reads an HTML body string and looks for all externally-hosted resources
Parameters:
soup(obj): A BeautifulSoup object
+ url (str): A string of the URL from which the page was requested (used to resolve relative links)
Returns:
- List: A list of URLs
+ List: A deduplicated list of URLs
"""
external_urls = []
# not sure if this is comprehensive but something like "for tag in soup.find_all()"
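A minimal standalone sketch of the scheme check the new validate_url performs, using only the standard library (the example URLs are hypothetical):

    import urllib.parse

    candidates = [
        "https://example.com/style.css",  # kept: valid URL with an http/https scheme
        "ftp://example.com/logo.gif",     # skipped: valid URL, but not http/https
        "cid:image001.jpg@example",       # skipped: no network location
    ]
    for url in candidates:
        result = urllib.parse.urlparse(url)
        ok = all([result.scheme, result.netloc]) and result.scheme.lower().strip().startswith("http")
        print(url, "->", "crawl" if ok else "skip")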
@@ -92,8 +118,11 @@ def html_external_resources(self, soup):
for tag in soup.findAll(tag):
if tag.get(attr) and tag.get(attr).lower().strip().startswith("http"):
external_urls.append(tag.get(attr))
+ else:
+ full_url = urllib.parse.urljoin(url, tag.get(attr))
+ external_urls.append(full_url)

- return external_urls
+ return list(dict.fromkeys(external_urls))

def css_external_resources(self, cssText, cssURL):
"""
@@ -104,7 +133,7 @@ def css_external_resources(self, cssText, cssURL):
cssText(str): A string of CSS
Returns:
- List: A list of URLs
+ List: A deduplicated list of URLs
"""

external_urls = []
@@ -125,7 +154,51 @@ def css_external_resources(self, cssText, cssURL):
else:
external_urls.append(urllib.parse.urljoin(cssURL, url))

- return external_urls
+ return list(dict.fromkeys(external_urls))
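Both collectors now deduplicate their results with list(dict.fromkeys(...)). A quick illustration of the idiom, assuming Python 3.7+ where dicts preserve insertion order:

    urls = ["https://a.example/x.css", "https://b.example/y.js", "https://a.example/x.css"]
    deduped = list(dict.fromkeys(urls))
    # dict keys are unique and keep insertion order, so this yields:
    # ["https://a.example/x.css", "https://b.example/y.js"]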

+ def crawl_external_urls(self, session, request_headers, warc_writer, urls, errors):
+ """
+ Crawls a list of URLs and adds the responses to a WARC file.
+ Parameters:
+ session (requests.Session): The requests session
+ request_headers (dict): A dict of request headers.
+ warc_writer (WARCWriter): A warcio WARC writer object for writing pages to a WARC
+ urls (list): A list of URLs to crawl and add to a WARC.
+ errors (List): List of Error objects defined in models.py
+ Returns:
+ session (requests.Session): The requests session
+ warc_writer (WARCWriter): A warcio WARC writer object for writing pages to a WARC
+ url_page_requisites (list): A de-duplicated list of page requisites (such as CSS and JS) that also need to be crawled
+ errors (List): List of Error objects defined in models.py
+ """
+ url_page_requisites = []
+ i = 0
+ while i < len(urls):
+ log.debug("capturing " + urls[i])
+ # validate url
+ if self.validate_url(urls[i], errors):
+ with capture_http(warc_writer):
+ # First try with SSL verification. If fails, raise a warning and turn off
+ try:
+ r = session.get(urls[i], headers=request_headers)
+ if r.status_code != 200:
+ desc = f"When writing WARC derivative, HTTP {r.status_code} {r.reason} for external resource {urls[i]}"
+ errors = common.handle_error(errors, None, desc, "warn")
+ if "content-type" in r.headers.keys():
+ if "text/html" in r.headers["content-type"]:
+ # Gotta get these external resources as well
+ new_soup = BeautifulSoup(r.text, "html.parser")
+ new_external_urls = self.html_external_resources(new_soup, r.url)
+ url_page_requisites.extend(new_external_urls)
+ elif r.headers["content-type"] == "text/css":
+ new_external_urls = self.css_external_resources(r.text, r.url)
+ url_page_requisites.extend(new_external_urls)
+ except Exception as e:
+ desc = f"Failed to request external URL for WARC derivatives ({urls[i]})"
+ errors = common.handle_error(errors, e, desc)
+ i += 1
+ return session, warc_writer, list(dict.fromkeys(url_page_requisites)), errors

def do_task_per_account(self):
log.debug(self.account.account_data())
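crawl_external_urls relies on warcio's capture_http context manager: any requests traffic issued inside the context is recorded to the WARC. A minimal standalone sketch of that pattern (note that warcio requires requests to be imported after capture_http):

    from warcio.capture_http import capture_http
    import requests  # must be imported after capture_http for capture to work
    from warcio.warcwriter import WARCWriter

    with open("example.warc.gz", "wb") as output:
        warc_writer = WARCWriter(output, gzip=True)
        with capture_http(warc_writer):
            requests.get("https://example.com/")  # request and response records are written to the WARC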
@@ -187,7 +260,7 @@ def do_task_per_message(self, message):
os.makedirs(out_dir)

with open(filename, "wb") as output:
- writer = WARCWriter(output, gzip=True)
+ warc_writer = WARCWriter(output, gzip=True)
# Write HTML Body
try:
headers_list = [
@@ -198,15 +271,15 @@ def do_task_per_message(self, message):
if message.Date:
headers_list.append(("Last-Modified", message.Date))
http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0")
- record = writer.create_warc_record(
+ record = warc_writer.create_warc_record(
f"{warc_uri}/body.html",
"response",
payload=BytesIO(html_formatted.encode("utf-8")),
length=len(html_formatted.encode("utf-8")),
http_headers=http_headers,
warc_content_type="text/html",
)
- writer.write_record(record)
+ warc_writer.write_record(record)
except Exception as e:
desc = "Error creating WARC response record for HTML body"
errors = common.handle_error(errors, e, desc)
@@ -217,29 +290,17 @@ def do_task_per_message(self, message):
request_headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
- i = 0
- while i < len(external_urls):
- log.debug("capturing " + external_urls[i])
- with capture_http(writer):
- # First try with SSL verification. If fails, raise a warning and turn off
- try:
- r = s.get(external_urls[i], headers=request_headers)
- if r.status_code != 200:
- desc = f"When writing WARC derivative, HTTP {r.status_code} {r.reason} for external resource {external_urls[i]}"
- errors = common.handle_error(errors, None, desc, "warn")
- if "content-type" in r.headers.keys():
- if r.headers["content-type"] == "text/html":
- # Gotta get these external resources as well
- new_soup = BeautifulSoup(r.text, "html.parser")
- new_external_urls = self.html_external_resources(new_soup)
- external_urls.extend(new_external_urls)
- elif r.headers["content-type"] == "text/css":
- new_external_urls = self.css_external_resources(r.text, r.url)
- external_urls.extend(new_external_urls)
- except Exception as e:
- desc = f"Failed to request external URL for WARC derivatives ({external_urls[i]})"
- errors = common.handle_error(errors, e, desc)
- i += 1
+
+ # Crawl external URLs
+ s, warc_writer, page_requisites, errors = self.crawl_external_urls(
+ s, request_headers, warc_writer, external_urls, errors
+ )
+
+ # Crawl external URL page requisites
+ s, warc_writer, new_page_requisites, errors = self.crawl_external_urls(
+ s, request_headers, warc_writer, page_requisites, errors
+ )

except Exception as e:
desc = "Error capturing external URL in WARC derivative"
errors = common.handle_error(errors, e, desc)
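The in-place loop above is replaced by two explicit passes: one over the resources referenced in the email body, and one over the page requisites (CSS and JS) those resources pull in; requisites discovered during the second pass are not fetched. A sketch of how the same calls could generalize to a configurable depth, assuming the method signature above and the surrounding do_task_per_message context:

    # hypothetical: crawl to a fixed depth instead of exactly two passes
    queue = external_urls
    for _ in range(2):  # a depth of 2 reproduces the merged behavior
        s, warc_writer, queue, errors = self.crawl_external_urls(
            s, request_headers, warc_writer, queue, errors
        )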
@@ -255,15 +316,15 @@ def do_task_per_message(self, message):
("Date", datetime_to_http_date(datetime.now())),
]
http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0")
- record = writer.create_warc_record(
+ record = warc_writer.create_warc_record(
f"{warc_uri}/{quote_plus(attachment.WrittenName)}",
"response",
payload=BytesIO(attachment.File),
length=len(attachment.File),
http_headers=http_headers,
warc_content_type="text/html",
)
- writer.write_record(record)
+ warc_writer.write_record(record)
except Exception as e:
desc = "Error adding attachments to WARC derivative"
errors = common.handle_error(errors, e, desc)
@@ -276,23 +337,23 @@ def do_task_per_message(self, message):
("Content-Length", str(len(headers_json))),
]
http_headers = StatusAndHeaders("200 OK", headers_list, protocol="HTTP/1.0")
- record = writer.create_warc_record(
+ record = warc_writer.create_warc_record(
f"{warc_uri}/headers.json",
"response",
payload=BytesIO(headers_json),
length=len(headers_json),
http_headers=http_headers,
warc_content_type="application/json",
)
- writer.write_record(record)
- record = writer.create_warc_record(
+ warc_writer.write_record(record)
+ record = warc_writer.create_warc_record(
f"{warc_uri}/headers.json",
"metadata",
payload=BytesIO(headers_json),
length=len(headers_json),
warc_content_type="application/json",
)
- writer.write_record(record)
+ warc_writer.write_record(record)
except Exception as e:
desc = "Error creating JSON metadata record to WARC derivative"
errors = common.handle_error(errors, e, desc)
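Record writing throughout this file now goes through the renamed warc_writer. For reference, a minimal self-contained sketch of writing one synthetic response record with warcio (the URI and payload here are hypothetical):

    from io import BytesIO
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders

    payload = b"<html><body>Hello</body></html>"
    with open("body.warc.gz", "wb") as output:
        warc_writer = WARCWriter(output, gzip=True)
        http_headers = StatusAndHeaders(
            "200 OK",
            [("Content-Type", 'text/html; charset="utf-8"'), ("Content-Length", str(len(payload)))],
            protocol="HTTP/1.0",
        )
        record = warc_writer.create_warc_record(
            "mailbag://example-mailbag/body.html",  # hypothetical URI
            "response",
            payload=BytesIO(payload),
            length=len(payload),
            http_headers=http_headers,
            warc_content_type="text/html",
        )
        warc_writer.write_record(record)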
16 changes: 15 additions & 1 deletion mailbagit/formats/msg.py
@@ -92,7 +92,21 @@ def messages(self, iteration_only=False):
html_encoding = None
text_encoding = None
# encoding check priorities
- encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}}
+ encodings = {}
+ """
+ The listed values are apparently unreliable for HTML bodies.
+ Thus, with the encodings dict empty, chardet will be used, which is apparently the least bad option.
+ try:
+ LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE = int("0x3fde", base=16)
+ LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE = int("0x3ffd", base=16)
+ message_body_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE)]
+ message_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE)]
+ encodings[1] = {"name": message_body_codepage, "label": "PidTagInternetCodepage"}
+ encodings[2] = {"name": message_codepage, "label": "PidTagMessageCodepage"}
+ except Exception as e:
+ desc = "Error reading codepages"
+ errors = common.handle_error(errors, e, desc)
+ """
try:
try:
if mail.htmlBody:
7 changes: 6 additions & 1 deletion mailbagit/helper/derivative.py
@@ -214,15 +214,20 @@ def htmlFormatting(message, external_css, headers=True):
# HT to extract_msg for this approach
# https://github.com/TeamMsgExtractor/msg-extractor/blob/6bed8213de1a7a41739fcf5c9363322508711fce/extract_msg/message_base.py#L403-L414
tags = (tag for tag in soup.findAll("img") if tag.get("src") and tag.get("src").startswith("cid:"))
- data = None
for tag in tags:
# Iterate through the attachments until we get the right one.
+ data = None
cid = tag["src"][4:]

for attachment in message.Attachments:
if attachment.Name:
if attachment.Name in cid:
data = attachment.File
+ if data == None:
+ for attachment in message.Attachments:
+ if attachment.Content_ID:
+ if attachment.Content_ID in cid:
+ data = attachment.File

# If we found anything, inject it.
if data:
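A cid taken from an img tag's src typically contains the attachment filename plus a generated suffix, which is why the matching uses a substring test on attachment.Name, with the new fallback on Content_ID when no name matches. A small self-contained sketch with hypothetical values:

    cid = "image001.jpg@01D9ABCD.12345678"  # from src="cid:image001.jpg@01D9ABCD.12345678"
    # hypothetical stand-ins for message.Attachments
    attachments = [{"Name": "image001.jpg", "Content_ID": "image001.jpg@01D9ABCD.12345678", "File": b"..."}]

    data = None
    for attachment in attachments:
        if attachment["Name"] and attachment["Name"] in cid:  # "image001.jpg" is a substring of the cid
            data = attachment["File"]
    if data is None:
        # fallback: match on the Content-ID property itself
        for attachment in attachments:
            if attachment["Content_ID"] and attachment["Content_ID"] in cid:
                data = attachment["File"]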
4 changes: 2 additions & 2 deletions mailbagit/helper/format.py
@@ -67,7 +67,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
try:
valid_encoding = codecs.lookup(encodings[priority]["name"]).name.lower()
valid.append(valid_encoding)
- text = binary_text.decode(valid_encoding)
+ text = binary_text.decode(valid_encoding, errors="strict")
used = encodings[priority]["name"]
success = True
break
@@ -78,7 +78,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
if success == False:
try:
detected = chardet.detect(binary_text)["encoding"]
- text = binary_text.decode(detected)
+ text = binary_text.decode(detected, errors="strict")
used = detected
if len(valid) < 1:
# desc = "No valid listed encodings, but successfully decoded " + body_type + " body with detected encoding " + detected
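With errors="strict", a wrong listed encoding now fails loudly instead of silently mangling characters, so the loop can move on to the next candidate or fall through to chardet detection. A minimal sketch of the pattern (chardet's guess on such a short input is an assumption, though utf-8 is typical):

    import chardet

    binary_text = "Résumé".encode("utf-8")

    text = None
    for name in ["ascii"]:  # hypothetical listed encodings, all wrong here
        try:
            text = binary_text.decode(name, errors="strict")  # raises UnicodeDecodeError on "é"
            break
        except (UnicodeDecodeError, LookupError):
            pass
    if text is None:
        detected = chardet.detect(binary_text)["encoding"]  # likely "utf-8"
        text = binary_text.decode(detected, errors="strict")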
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setuptools.setup(
name="mailbagit",
- version="0.7.0",
+ version="0.7.1",
author="Gregory Wiedeman",
author_email="[email protected]",
description="A tool for preserving email in multiple preservation formats.",
