Merge pull request #72 from jishac/encoding_edge_cases
Encoding: edge cases and error handling
This commit is contained in:
commit
fe443e6bd3
@ -841,6 +841,76 @@ class BaseFolder:
|
||||
(uid, self.accountname))
|
||||
raise # Raise on unknown errors, so we can fix those.
|
||||
|
||||
def _extract_message_id(self, raw_msg_bytes):
    """Extract the Message-ID from a bytes object containing a raw message.

    This function attempts to find the Message-ID for a message that has not
    been processed by the built-in email library, and is therefore NOT an
    email object. If parsing the message fails (or is otherwise not
    needed), this utility can be useful to help provide a (hopefully) unique
    identifier in log messages to facilitate locating the message on disk.

    Only the header section (everything before the first blank line) is
    searched, so a "Message-ID:" string inside the body is never picked up.
    The header name is matched case-insensitively and may be the very first
    line of the message.

    :param raw_msg_bytes: bytes object containing the raw email message.
    :returns: A tuple ``(msg_id, is_rfc_format)``. ``msg_id`` is a bytes
              object holding the contents of the Message-ID header if found
              (or ``b"<Unknown Message-ID>"`` if the header is missing or
              empty). ``is_rfc_format`` is True if the Message-ID was in
              proper RFC format or False if it contained defects.
    """
    unknown = b"<Unknown Message-ID>"

    # Only the header block is of interest; maxsplit=1 stops at the first
    # blank line instead of splitting the whole body as well.
    msg_header = re.split(b'\r?\n\r?\n', raw_msg_bytes, maxsplit=1)[0]
    flags = re.IGNORECASE | re.MULTILINE

    # Strict pass: an angle-bracketed id made of RFC "atext" characters
    # (plus '.', '@' and ' '). '^' with MULTILINE also matches when the
    # Message-ID is the first header line. The '-' is escaped so that it is
    # a literal dash and not an accidental '+'..'/' range (which would also
    # admit ','). '\s*' accepts a missing space after the colon as well as
    # a value folded onto the following line.
    strict = re.search(
        br"^message-id:\s*(<[A-Za-z0-9!#$%&'*+\-/=?^_`{}|~.@ ]+>)",
        msg_header, flags)
    if strict is not None:
        return (strict.group(1), True)

    # No match - Likely not following RFC rules. Try and find anything
    # that looks like it could be the Message-ID but flag it. The value is
    # the rest of the header line plus any folded continuation lines
    # (lines starting with a space or tab); the match ends at the next
    # header or at the end of the header block, whichever comes first.
    loose = re.search(
        br"^message-id:((?:[^\r\n]|\r?\n[ \t])*)",
        msg_header, flags)
    if loose is None:
        return (unknown, False)

    msg_id = loose.group(1).strip()
    # An empty header value is no more useful than a missing header.
    return (msg_id or unknown, False)
|
||||
|
||||
def _quote_boundary_fix(self, raw_msg_bytes):
    """Modify a raw message to quote the boundary separator for multipart messages.

    This function quotes only the first occurrence of the boundary field in
    the Content-Type header of the message, and quotes any boundary value.
    Improperly quoted boundary fields can give the internal python email
    library issues.

    The message is returned unchanged when there is no boundary parameter
    in the Content-Type header, when the value is already quoted, or when
    the value is empty.

    :param raw_msg_bytes: bytes object containing the raw email message.
    :returns: The raw byte stream containing the quoted boundary
    """
    # Extract just the header (everything before the first blank line), and
    # search for the boundary in the content-type header and extract just
    # the boundary and characters per
    # RFC 2046 ( see https://tools.ietf.org/html/rfc2046#section-5.1.1 )
    # We don't cap the length to 70 characters, because we are just trying to
    # soft fix this message to resolve the python library looking for properly
    # quoted boundaries.
    header = re.split(b'\r?\n\r?\n', raw_msg_bytes, maxsplit=1)[0]

    # The lazy prefix only walks over the Content-Type header itself: plain
    # characters of the line, or a line break that is followed by folding
    # whitespace. It therefore stops at the FIRST boundary= and never
    # wanders into an unrelated header that follows.
    match = re.search(
        br"^content-type:(?:[^\r\n]|\r?\n[ \t])*?"
        br"(boundary=\"?[A-Za-z0-9'()+_,\-./:=? ]+\"?)",
        header, re.IGNORECASE | re.MULTILINE)
    if match is None:
        # No match
        return raw_msg_bytes

    # get the boundary field, and strip off any trailing ws (against RFC
    # rules, leading ws is OK)
    name, value = match.group(1).split(b'=', 1)
    value = value.rstrip()

    if len(value) >= 2 and value[:1] == value[-1:] == b'"':
        # Sanity Check - Do not requote if already quoted.
        # A quoted boundary was the end goal so return the original
        #
        # No need to worry about if the original email did something like:
        # boundary="ahahah " as the email library will trim the ws for us
        return raw_msg_bytes

    # Drop a stray opening or closing quote so a half-quoted value is not
    # turned into boundary=""value".
    value = value.strip(b'"')
    if not value:
        # Nothing usable to quote (e.g. "boundary= "); leave message as is.
        return raw_msg_bytes

    new_field = b''.join([name, b'="', value, b'"'])

    # The header is a prefix of the raw message, so the match offsets are
    # valid for the full message; splice at the matched position rather
    # than replacing by content.
    start, end = match.span(1)
    return raw_msg_bytes[:start] + new_field + raw_msg_bytes[end:]
|
||||
|
||||
def __syncmessagesto_copy(self, dstfolder, statusfolder):
|
||||
"""Pass1: Copy locally existing messages not on the other side.
|
||||
|
||||
|
@ -24,6 +24,7 @@ from offlineimap import imaputil, imaplibutil, OfflineImapError
|
||||
from offlineimap import globals
|
||||
from imaplib2 import MonthNames
|
||||
from .Base import BaseFolder
|
||||
from email.errors import NoBoundaryInMultipartDefect
|
||||
|
||||
# Globals
|
||||
CRLF = '\r\n'
|
||||
@ -735,10 +736,8 @@ class IMAPFolder(BaseFolder):
|
||||
raise OfflineImapError(
|
||||
"Saving msg (%s) in folder '%s', "
|
||||
"repository '%s' failed (abort). "
|
||||
"Server responded: %s\n"
|
||||
"Message content was: %s" %
|
||||
(msg_id, self, self.getrepository(),
|
||||
str(e), dbg_output),
|
||||
"Server responded: %s\n" %
|
||||
(msg_id, self, self.getrepository(), str(e)),
|
||||
OfflineImapError.ERROR.MESSAGE,
|
||||
exc_info()[2])
|
||||
|
||||
@ -752,10 +751,8 @@ class IMAPFolder(BaseFolder):
|
||||
imapobj = None
|
||||
raise OfflineImapError(
|
||||
"Saving msg (%s) folder '%s', repo '%s'"
|
||||
"failed (error). Server responded: %s\n"
|
||||
"Message content was: %s" %
|
||||
(msg_id, self, self.getrepository(),
|
||||
str(e), dbg_output),
|
||||
"failed (error). Server responded: %s\n" %
|
||||
(msg_id, self, self.getrepository(), str(e)),
|
||||
OfflineImapError.ERROR.MESSAGE,
|
||||
exc_info()[2])
|
||||
|
||||
@ -905,7 +902,35 @@ class IMAPFolder(BaseFolder):
|
||||
# Convert email, d[0][1], into a message object (from bytes)
|
||||
|
||||
ndata0 = data[0][0].decode('utf-8')
|
||||
ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1])
|
||||
try: ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1])
|
||||
except:
|
||||
err = exc_info()
|
||||
response_type = type(data[0][1]).__name__
|
||||
msg_id = self._extract_message_id(data[0][1])[0].decode('ascii',errors='surrogateescape')
|
||||
raise OfflineImapError(
|
||||
"Exception parsing message with ID ({}) from imaplib (response type: {}).\n {}: {}".format(
|
||||
msg_id, response_type, err[0].__name__, err[1]),
|
||||
OfflineImapError.ERROR.MESSAGE)
|
||||
if len(ndata1.defects) > 0:
|
||||
# We don't automatically apply fixes as to attempt to preserve the original message
|
||||
self.ui.warn("UID {} has defects: {}".format(uids, ndata1.defects))
|
||||
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in ndata1.defects):
|
||||
# (Hopefully) Rare defect from a broken client where multipart boundary is
|
||||
# not properly quoted. Attempt to solve by fixing the boundary and parsing
|
||||
self.ui.warn(" ... applying multipart boundary fix.")
|
||||
ndata1 = self.parser['8bit-RFC'].parsebytes(self._quote_boundary_fix(data[0][1]))
|
||||
try:
|
||||
# See if the defects after fixes are preventing us from obtaining bytes
|
||||
_ = ndata1.as_bytes(policy=self.policy['8bit-RFC'])
|
||||
except UnicodeEncodeError as err:
|
||||
# Unknown issue which is causing failure of as_bytes()
|
||||
msg_id = self.getmessageheader(ndata1, "message-id")
|
||||
if msg_id is None:
|
||||
msg_id = '<Unknown Message-ID>'
|
||||
raise OfflineImapError(
|
||||
"UID {} ({}) has defects preventing it from being processed!\n {}: {}".format(
|
||||
uids, msg_id, type(err).__name__, err),
|
||||
OfflineImapError.ERROR.MESSAGE)
|
||||
ndata = [ndata0, ndata1]
|
||||
|
||||
return ndata
|
||||
|
@ -25,6 +25,7 @@ from threading import Lock
|
||||
from hashlib import md5
|
||||
from offlineimap import OfflineImapError
|
||||
from .Base import BaseFolder
|
||||
from email.errors import NoBoundaryInMultipartDefect
|
||||
|
||||
# Find the UID in a message filename
|
||||
re_uidmatch = re.compile(',U=(\d+)')
|
||||
@ -259,8 +260,36 @@ class MaildirFolder(BaseFolder):
|
||||
filename = self.messagelist[uid]['filename']
|
||||
filepath = os.path.join(self.getfullname(), filename)
|
||||
fd = open(filepath, 'rb')
|
||||
retval = self.parser['8bit'].parse(fd)
|
||||
_fd_bytes = fd.read()
|
||||
fd.close()
|
||||
try: retval = self.parser['8bit'].parsebytes(_fd_bytes)
|
||||
except:
|
||||
err = exc_info()
|
||||
msg_id = self._extract_message_id(_fd_bytes)[0].decode('ascii',errors='surrogateescape')
|
||||
raise OfflineImapError(
|
||||
"Exception parsing message with ID ({}) from file ({}).\n {}: {}".format(
|
||||
msg_id, filename, err[0].__name__, err[1]),
|
||||
OfflineImapError.ERROR.MESSAGE)
|
||||
if len(retval.defects) > 0:
|
||||
# We don't automatically apply fixes as to attempt to preserve the original message
|
||||
self.ui.warn("UID {} has defects: {}".format(uid, retval.defects))
|
||||
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in retval.defects):
|
||||
# (Hopefully) Rare defect from a broken client where multipart boundary is
|
||||
# not properly quoted. Attempt to solve by fixing the boundary and parsing
|
||||
self.ui.warn(" ... applying multipart boundary fix.")
|
||||
retval = self.parser['8bit'].parsebytes(self._quote_boundary_fix(_fd_bytes))
|
||||
try:
|
||||
# See if the defects after fixes are preventing us from obtaining bytes
|
||||
_ = retval.as_bytes(policy=self.policy['8bit'])
|
||||
except UnicodeEncodeError as err:
|
||||
# Unknown issue which is causing failure of as_bytes()
|
||||
msg_id = self.getmessageheader(retval, "message-id")
|
||||
if msg_id is None:
|
||||
msg_id = '<unknown-message-id>'
|
||||
raise OfflineImapError(
|
||||
"UID {} ({}) has defects preventing it from being processed!\n {}: {}".format(
|
||||
uid, msg_id, type(err).__name__, err),
|
||||
OfflineImapError.ERROR.MESSAGE)
|
||||
return retval
|
||||
|
||||
# Interface from BaseFolder
|
||||
|
Loading…
Reference in New Issue
Block a user