Add a handler to detect the unlikely edge case where a message has an
improperly quoted boundary, which can cause the Python email library to
fail to reproduce the original message with msg.as_bytes().  See:
https://bugs.python.org/issue43818 and
https://github.com/OfflineIMAP/offlineimap3/issues/62
This commit is contained in:
Joseph Ishac 2021-04-13 00:01:26 -04:00
parent 6a45eef3b5
commit b78af75064
2 changed files with 51 additions and 0 deletions

View File

@ -26,6 +26,7 @@ from email import policy
from email.parser import BytesParser
from email.generator import BytesGenerator
from email.utils import parsedate_tz, mktime_tz
from email.errors import NoBoundaryInMultipartDefect
from offlineimap import threadutil
from offlineimap.ui import getglobalui

View File

@ -225,6 +225,44 @@ class MaildirFolder(BaseFolder):
retval[uid] = date_excludees[uid]
return retval
def _quote_boundary_fix(self, raw_msg_bytes):
    """Quote the multipart boundary parameter in a raw message's header.

    Only the header block (everything before the first blank line) is
    searched, and only the first ``boundary=`` parameter of a Content-Type
    header is rewritten.  Improperly quoted boundary fields can give the
    internal python email library issues.

    The message is returned unchanged when there is no Content-Type
    boundary parameter, when the boundary value is empty, or when the
    value already starts with a double quote.

    :param raw_msg_bytes: The complete raw message as ``bytes``
    :returns: The raw byte stream containing the quoted boundary
    """
    # The header block is a prefix of the message, so match offsets found
    # in it are also valid offsets into raw_msg_bytes.
    header = re.split(rb'\r?\n\r?\n', raw_msg_bytes, maxsplit=1)[0]

    # Search the content-type header for the boundary parameter and capture
    # just the boundary characters allowed by RFC 2046
    # ( see https://tools.ietf.org/html/rfc2046#section-5.1.1 ).
    # We don't cap the length to 70 characters, because we are just trying
    # to soft fix this message to resolve the python library looking for
    # properly quoted boundaries.
    #
    # The header may be folded, so the scan between "content-type:" and
    # "boundary=" is allowed to step over a line break only when the next
    # line starts with whitespace (a continuation line).  The scan is lazy
    # so that the first boundary parameter is the one that is captured.
    match = re.search(
        rb"^content-type:(?:[^\r\n]|\r?\n[ \t])*?"
        rb"(boundary=\"?[A-Za-z0-9'()+_,\-./:=? ]*\"?)",
        header, re.IGNORECASE | re.MULTILINE)
    if match is None:
        # No boundary parameter to fix
        return raw_msg_bytes

    # Strip trailing ws from the value (against RFC rules; leading ws is OK).
    name, value = match.group(1).split(b'=', 1)
    value = value.rstrip()

    if not value:
        # "boundary=" with nothing after it: there is nothing to quote.
        return raw_msg_bytes
    if value.startswith(b'"'):
        # Sanity Check - Do not requote if already quoted.
        # A quoted boundary was the end goal so return the original.  This
        # also covers quoted values holding characters outside the set
        # matched above, where the capture stops before the closing quote;
        # adding quotes there would corrupt the header.
        #
        # No need to worry about if the original email did something like:
        # boundary="ahahah " as the email library will trim the ws for us
        return raw_msg_bytes

    quoted_field = b''.join([name, b'="', value, b'"'])
    # Splice at the matched position rather than using bytes.replace(), so
    # identical text earlier in the message (e.g. in a Subject header) is
    # never rewritten by mistake.
    return b''.join([raw_msg_bytes[:match.start(1)],
                     quoted_field,
                     raw_msg_bytes[match.end(1):]])
# Interface from BaseFolder
def quickchanged(self, statusfolder):
"""Returns True if the Maildir has changed
@ -260,6 +298,18 @@ class MaildirFolder(BaseFolder):
filepath = os.path.join(self.getfullname(), filename)
fd = open(filepath, 'rb')
retval = self.parser['8bit'].parse(fd)
try:
_ = retval.as_bytes(policy=self.policy['8bit'])
except UnicodeEncodeError as err:
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in retval.defects):
# (Hopefully) Rare instance where multipart boundary is not
# properly quoted. Solve by fixing the boundary and parsing
fd.seek(0)
_buffer = fd.read()
retval = self.parser['8bit'].parsebytes(_quote_boundary_fix(_buffer))
else:
# Unknown issue which is causing failure of as_bytes()
ui.warn("Message has defects preventing it from being processed!")
fd.close()
return retval