Merge pull request #72 from jishac/encoding_edge_cases

Encoding: edge cases and error handling
This commit is contained in:
Rodolfo García Peñas (kix) 2021-06-09 06:03:54 +02:00 committed by GitHub
commit fe443e6bd3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 134 additions and 10 deletions

View File

@ -841,6 +841,76 @@ class BaseFolder:
(uid, self.accountname)) (uid, self.accountname))
raise # Raise on unknown errors, so we can fix those. raise # Raise on unknown errors, so we can fix those.
def _extract_message_id(self, raw_msg_bytes):
"""Extract the Message-ID from a bytes object containing a raw message.
This function attempts to find the Message-ID for a message that has not
been processed by the built-in email library, and is therefore NOT an
email object. If parsing the message fails (or is otherwise not
needed), this utility can be useful to help provide a (hopefully) unique
identifier in log messages to facilitate locating the message on disk.
:param raw_msg_bytes: bytes object containing the raw email message.
:returns: A tuple containing the contents of the Message-ID header if
found (or <Unknown Message-ID> if not found) and a flag which is True if
the Message-ID was in proper RFC format or False if it contained
defects.
"""
msg_header = re.split(b'[\r]?\n[\r]?\n', raw_msg_bytes)[0]
try:
msg_id = re.search(b"\nmessage-id:[\s]+(<[A-Za-z0-9!#$%&'*+-/=?^_`{}|~.@ ]+>)",
msg_header, re.IGNORECASE).group(1)
except AttributeError:
# No match - Likely not following RFC rules. Try and find anything
# that looks like it could be the Message-ID but flag it.
_start_pos = msg_header.find(b'\nMessage-ID:')
if _start_pos > 0:
_end_pos = msg_header.find(b'\n',_start_pos+15)
msg_id = msg_header[_start_pos+12:_end_pos].strip()
return (msg_id, False)
else:
return (b"<Unknown Message-ID>", False)
return (msg_id, True)
def _quote_boundary_fix(self, raw_msg_bytes):
"""Modify a raw message to quote the boundary separator for multipart messages.
This function quotes only the first occurrence of the boundary field in
the email header, and quotes any boundary value. Improperly quoted
boundary fields can give the internal python email library issues.
:param raw_msg_bytes: bytes object containing the raw email message.
:returns: The raw byte stream containing the quoted boundary
"""
# Use re.split to extract just the header, and search for the boundary in
# the context-type header and extract just the boundary and characters per
# RFC 2046 ( see https://tools.ietf.org/html/rfc2046#section-5.1.1 )
# We don't cap the length to 70 characters, because we are just trying to
# soft fix this message to resolve the python library looking for properly
# quoted boundaries.
try: boundary_field = \
re.search(b"content-type:.*(boundary=[\"]?[A-Za-z0-9'()+_,-./:=? ]+[\"]?)",
re.split(b'[\r]?\n[\r]?\n', raw_msg_bytes)[0],
(re.IGNORECASE|re.DOTALL)).group(1)
except AttributeError:
# No match
return raw_msg_bytes
# get the boundary field, and strip off any trailing ws (against RFC rules, leading ws is OK)
# if it was already quoted, well then there was nothing to fix
boundary, value = boundary_field.split(b'=', 1)
value = value.rstrip()
# ord(b'"') == 34
if value[0] == value[-1] == 34:
# Sanity Check - Do not requote if already quoted.
# A quoted boundary was the end goal so return the original
#
# No need to worry about if the original email did something like:
# boundary="ahahah " as the email library will trim the ws for us
return raw_msg_bytes
else:
new_field = b''.join([boundary, b'="', value, b'"'])
return(raw_msg_bytes.replace(boundary_field, new_field, 1))
def __syncmessagesto_copy(self, dstfolder, statusfolder): def __syncmessagesto_copy(self, dstfolder, statusfolder):
"""Pass1: Copy locally existing messages not on the other side. """Pass1: Copy locally existing messages not on the other side.

View File

@ -24,6 +24,7 @@ from offlineimap import imaputil, imaplibutil, OfflineImapError
from offlineimap import globals from offlineimap import globals
from imaplib2 import MonthNames from imaplib2 import MonthNames
from .Base import BaseFolder from .Base import BaseFolder
from email.errors import NoBoundaryInMultipartDefect
# Globals # Globals
CRLF = '\r\n' CRLF = '\r\n'
@ -735,10 +736,8 @@ class IMAPFolder(BaseFolder):
raise OfflineImapError( raise OfflineImapError(
"Saving msg (%s) in folder '%s', " "Saving msg (%s) in folder '%s', "
"repository '%s' failed (abort). " "repository '%s' failed (abort). "
"Server responded: %s\n" "Server responded: %s\n" %
"Message content was: %s" % (msg_id, self, self.getrepository(), str(e)),
(msg_id, self, self.getrepository(),
str(e), dbg_output),
OfflineImapError.ERROR.MESSAGE, OfflineImapError.ERROR.MESSAGE,
exc_info()[2]) exc_info()[2])
@ -752,10 +751,8 @@ class IMAPFolder(BaseFolder):
imapobj = None imapobj = None
raise OfflineImapError( raise OfflineImapError(
"Saving msg (%s) folder '%s', repo '%s'" "Saving msg (%s) folder '%s', repo '%s'"
"failed (error). Server responded: %s\n" "failed (error). Server responded: %s\n" %
"Message content was: %s" % (msg_id, self, self.getrepository(), str(e)),
(msg_id, self, self.getrepository(),
str(e), dbg_output),
OfflineImapError.ERROR.MESSAGE, OfflineImapError.ERROR.MESSAGE,
exc_info()[2]) exc_info()[2])
@ -905,7 +902,35 @@ class IMAPFolder(BaseFolder):
# Convert email, d[0][1], into a message object (from bytes) # Convert email, d[0][1], into a message object (from bytes)
ndata0 = data[0][0].decode('utf-8') ndata0 = data[0][0].decode('utf-8')
ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1]) try: ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1])
except:
err = exc_info()
response_type = type(data[0][1]).__name__
msg_id = self._extract_message_id(data[0][1])[0].decode('ascii',errors='surrogateescape')
raise OfflineImapError(
"Exception parsing message with ID ({}) from imaplib (response type: {}).\n {}: {}".format(
msg_id, response_type, err[0].__name__, err[1]),
OfflineImapError.ERROR.MESSAGE)
if len(ndata1.defects) > 0:
# We don't automatically apply fixes as to attempt to preserve the original message
self.ui.warn("UID {} has defects: {}".format(uids, ndata1.defects))
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in ndata1.defects):
# (Hopefully) Rare defect from a broken client where multipart boundary is
# not properly quoted. Attempt to solve by fixing the boundary and parsing
self.ui.warn(" ... applying multipart boundary fix.")
ndata1 = self.parser['8bit-RFC'].parsebytes(self._quote_boundary_fix(data[0][1]))
try:
# See if the defects after fixes are preventing us from obtaining bytes
_ = ndata1.as_bytes(policy=self.policy['8bit-RFC'])
except UnicodeEncodeError as err:
# Unknown issue which is causing failure of as_bytes()
msg_id = self.getmessageheader(ndata1, "message-id")
if msg_id is None:
msg_id = '<Unknown Message-ID>'
raise OfflineImapError(
"UID {} ({}) has defects preventing it from being processed!\n {}: {}".format(
uids, msg_id, type(err).__name__, err),
OfflineImapError.ERROR.MESSAGE)
ndata = [ndata0, ndata1] ndata = [ndata0, ndata1]
return ndata return ndata

View File

@ -25,6 +25,7 @@ from threading import Lock
from hashlib import md5 from hashlib import md5
from offlineimap import OfflineImapError from offlineimap import OfflineImapError
from .Base import BaseFolder from .Base import BaseFolder
from email.errors import NoBoundaryInMultipartDefect
# Find the UID in a message filename # Find the UID in a message filename
re_uidmatch = re.compile(',U=(\d+)') re_uidmatch = re.compile(',U=(\d+)')
@ -259,8 +260,36 @@ class MaildirFolder(BaseFolder):
filename = self.messagelist[uid]['filename'] filename = self.messagelist[uid]['filename']
filepath = os.path.join(self.getfullname(), filename) filepath = os.path.join(self.getfullname(), filename)
fd = open(filepath, 'rb') fd = open(filepath, 'rb')
retval = self.parser['8bit'].parse(fd) _fd_bytes = fd.read()
fd.close() fd.close()
try: retval = self.parser['8bit'].parsebytes(_fd_bytes)
except:
err = exc_info()
msg_id = self._extract_message_id(_fd_bytes)[0].decode('ascii',errors='surrogateescape')
raise OfflineImapError(
"Exception parsing message with ID ({}) from file ({}).\n {}: {}".format(
msg_id, filename, err[0].__name__, err[1]),
OfflineImapError.ERROR.MESSAGE)
if len(retval.defects) > 0:
# We don't automatically apply fixes as to attempt to preserve the original message
self.ui.warn("UID {} has defects: {}".format(uid, retval.defects))
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in retval.defects):
# (Hopefully) Rare defect from a broken client where multipart boundary is
# not properly quoted. Attempt to solve by fixing the boundary and parsing
self.ui.warn(" ... applying multipart boundary fix.")
retval = self.parser['8bit'].parsebytes(self._quote_boundary_fix(_fd_bytes))
try:
# See if the defects after fixes are preventing us from obtaining bytes
_ = retval.as_bytes(policy=self.policy['8bit'])
except UnicodeEncodeError as err:
# Unknown issue which is causing failure of as_bytes()
msg_id = self.getmessageheader(retval, "message-id")
if msg_id is None:
msg_id = '<unknown-message-id>'
raise OfflineImapError(
"UID {} ({}) has defects preventing it from being processed!\n {}: {}".format(
uid, msg_id, type(err).__name__, err),
OfflineImapError.ERROR.MESSAGE)
return retval return retval
# Interface from BaseFolder # Interface from BaseFolder