From b78af7506442878c51c1e82f1ef36149539d06de Mon Sep 17 00:00:00 2001
From: Joseph Ishac
Date: Tue, 13 Apr 2021 00:01:26 -0400
Subject: [PATCH] Adding a handler to detect the unlikely edge case where a
 message may have an improperly quoted boundary that can cause the python
 library to fail to reproduce the original message with msg.as_bytes().

See: https://bugs.python.org/issue43818 and
https://github.com/OfflineIMAP/offlineimap3/issues/62
---
 offlineimap/folder/Base.py    |  1 +
 offlineimap/folder/Maildir.py | 59 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/offlineimap/folder/Base.py b/offlineimap/folder/Base.py
index f757e75..5f07960 100644
--- a/offlineimap/folder/Base.py
+++ b/offlineimap/folder/Base.py
@@ -26,6 +26,7 @@ from email import policy
 from email.parser import BytesParser
 from email.generator import BytesGenerator
 from email.utils import parsedate_tz, mktime_tz
+from email.errors import NoBoundaryInMultipartDefect
 
 from offlineimap import threadutil
 from offlineimap.ui import getglobalui
diff --git a/offlineimap/folder/Maildir.py b/offlineimap/folder/Maildir.py
index 74a4ab2..827d8ce 100644
--- a/offlineimap/folder/Maildir.py
+++ b/offlineimap/folder/Maildir.py
@@ -225,6 +225,48 @@ class MaildirFolder(BaseFolder):
                     retval[uid] = date_excludees[uid]
         return retval
 
+    def _quote_boundary_fix(self, raw_msg_bytes):
+        """Modify a raw message to quote the boundary separator for multipart messages.
+
+        This function quotes only the first occurrence of the boundary field in
+        the email header, and quotes any boundary value. Improperly quoted
+        boundary fields can give the internal python email library issues.
+
+        :param raw_msg_bytes: The complete raw message (header and body) as bytes
+        :returns: The raw byte stream containing the quoted boundary
+        """
+        # Use re.split to extract just the header, and search for the boundary in
+        # the content-type header and extract just the boundary and characters per
+        # RFC 2046 ( see https://tools.ietf.org/html/rfc2046#section-5.1.1 )
+        # We don't cap the length to 70 characters, because we are just trying to
+        # soft fix this message to resolve the python library looking for properly
+        # quoted boundaries.
+        # The boundary parameter may sit on a folded continuation line, so the
+        # pattern also steps over a line break that is followed by a space or tab.
+        try: boundary_field = \
+            re.search(b"content-type:(?:.|\n[ \t])*(boundary=[\"]?[A-Za-z0-9'()+_,-./:=? ]*[\"]?)",
+                      re.split(b'[\r]?\n[\r]?\n',raw_msg_bytes)[0],re.IGNORECASE).group(1)
+        except AttributeError:
+            # No match
+            return raw_msg_bytes
+        # get the boundary field, and strip off any trailing ws (against RFC rules, leading ws is OK)
+        # if it was already quoted, well then there was nothing to fix
+        boundary, value = boundary_field.split(b'=',1)
+        value = value.rstrip()
+        # An empty value has nothing to quote; catch it before indexing below.
+        # ord(b'"') == 34
+        if not value or value[0] == value[-1] == 34:
+            # Sanity Check - Do not requote if already quoted.
+            # A quoted boundary was the end goal so return the original
+            #
+            # No need to worry about if the original email did something like:
+            # boundary="ahahah " as the email library will trim the ws for us
+            return raw_msg_bytes
+        else:
+            new_field = b''.join([boundary,b'="',value,b'"'])
+            return(raw_msg_bytes.replace(boundary_field,new_field,1))
+
+
     # Interface from BaseFolder
     def quickchanged(self, statusfolder):
         """Returns True if the Maildir has changed
@@ -260,6 +302,23 @@ class MaildirFolder(BaseFolder):
         filepath = os.path.join(self.getfullname(), filename)
         fd = open(filepath, 'rb')
         retval = self.parser['8bit'].parse(fd)
+        try:
+            _ = retval.as_bytes(policy=self.policy['8bit'])
+        except UnicodeEncodeError as err:
+            # Local import: this patch adds the name to Base.py only, so make
+            # sure it is bound in this module as well.
+            from email.errors import NoBoundaryInMultipartDefect
+            if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in retval.defects):
+                # (Hopefully) Rare instance where multipart boundary is not
+                # properly quoted. Solve by fixing the boundary and parsing
+                fd.seek(0)
+                _buffer = fd.read()
+                retval = self.parser['8bit'].parsebytes(self._quote_boundary_fix(_buffer))
+            else:
+                # Unknown issue which is causing failure of as_bytes()
+                # NOTE(review): `ui` is not defined in the visible hunks -- confirm
+                # it is in scope in this module.
+                ui.warn("Message has defects preventing it from being processed!")
         fd.close()
         return retval
 