Merge pull request #72 from jishac/encoding_edge_cases
Encoding: edge cases and error handling
This commit is contained in:
commit
fe443e6bd3
@ -841,6 +841,76 @@ class BaseFolder:
|
|||||||
(uid, self.accountname))
|
(uid, self.accountname))
|
||||||
raise # Raise on unknown errors, so we can fix those.
|
raise # Raise on unknown errors, so we can fix those.
|
||||||
|
|
||||||
|
def _extract_message_id(self, raw_msg_bytes):
    """Extract the Message-ID from a bytes object containing a raw message.

    This function attempts to find the Message-ID for a message that has not
    been processed by the built-in email library, and is therefore NOT an
    email object. If parsing the message fails (or is otherwise not
    needed), this utility can be useful to help provide a (hopefully) unique
    identifier in log messages to facilitate locating the message on disk.

    Only the header block (everything before the first blank line) is
    searched, and the header name is matched case-insensitively.

    :param raw_msg_bytes: bytes object containing the raw email message.
    :returns: A tuple ``(msg_id, well_formed)``. ``msg_id`` is a bytes object
              holding the contents of the Message-ID header, or
              ``b"<Unknown Message-ID>"`` if no usable value was found.
              ``well_formed`` is True only if the value matched the strict
              ``<...>`` pattern, and False if it contained defects or was
              missing.
    """
    unknown_id = b"<Unknown Message-ID>"
    msg_header = re.split(b'[\r]?\n[\r]?\n', raw_msg_bytes)[0]

    # Strict match: "<...>" built from the accepted character set.
    # NOTE: "+-/" inside the class is a range ('+' through '/'), so it also
    # admits ',' - kept as-is so that the set of accepted IDs is unchanged.
    # (?:\A|\n) lets the header be the very first line of the message.
    strict = re.search(
        rb"(?:\A|\n)message-id:[\s]+(<[A-Za-z0-9!#$%&'*+-/=?^_`{}|~.@ ]+>)",
        msg_header, re.IGNORECASE)
    if strict is not None:
        return (strict.group(1), True)

    # No strict match - likely not following RFC rules. Try and find
    # anything that looks like it could be the Message-ID, but flag it.
    loose = re.search(rb"(?:\A|\n)message-id:", msg_header, re.IGNORECASE)
    if loose is None:
        return (unknown_id, False)

    # Take the remainder of the header line. Splitting on newlines (rather
    # than searching for the next newline from a fixed offset) keeps the
    # final byte intact when the Message-ID is the last header line.
    lines = msg_header[loose.end():].split(b'\n')
    msg_id = lines[0].strip()
    if not msg_id and len(lines) > 1 and lines[1][:1] in (b' ', b'\t'):
        # Folded header: the value sits on the continuation line.
        msg_id = lines[1].strip()
    if not msg_id:
        return (unknown_id, False)
    return (msg_id, False)
|
||||||
|
|
||||||
|
def _quote_boundary_fix(self, raw_msg_bytes):
    """Modify a raw message to quote the boundary separator for multipart messages.

    This function quotes only the first occurrence of the boundary field
    that follows the Content-Type header name in the email header, and
    quotes any boundary value. Improperly quoted boundary fields can give
    the internal python email library issues.

    The message is returned unchanged when no boundary field is found, when
    the boundary value is empty, or when the value is already quoted.

    :param raw_msg_bytes: bytes object containing the raw email message.
    :returns: The raw byte stream containing the quoted boundary
    """
    # Use re.split to extract just the header, and search for the boundary in
    # the content-type header and extract just the boundary and characters per
    # RFC 2046 ( see https://tools.ietf.org/html/rfc2046#section-5.1.1 )
    # We don't cap the length to 70 characters, because we are just trying to
    # soft fix this message to resolve the python library looking for properly
    # quoted boundaries.
    msg_header = re.split(b'[\r]?\n[\r]?\n', raw_msg_bytes)[0]

    # The ".*?" is deliberately non-greedy: with DOTALL a greedy ".*" would
    # skip ahead to the LAST "boundary=" in the header block, which may
    # belong to an unrelated header rather than to Content-Type.
    match = re.search(
        b"content-type:.*?(boundary=[\"]?[A-Za-z0-9'()+_,-./:=? ]+[\"]?)",
        msg_header, re.IGNORECASE | re.DOTALL)
    if match is None:
        # No boundary parameter - nothing to fix.
        return raw_msg_bytes
    boundary_field = match.group(1)

    # Get the boundary field, and strip off any trailing ws (against RFC
    # rules, leading ws is OK).
    boundary, value = boundary_field.split(b'=', 1)
    value = value.rstrip()
    if not value:
        # The value was whitespace only; there is nothing sensible to quote
        # (and indexing into it would raise IndexError).
        return raw_msg_bytes

    if value[:1] == b'"' and value[-1:] == b'"':
        # Sanity Check - Do not requote if already quoted.
        # A quoted boundary was the end goal so return the original.
        #
        # No need to worry about if the original email did something like:
        # boundary="ahahah " as the email library will trim the ws for us
        return raw_msg_bytes

    new_field = b''.join([boundary, b'="', value, b'"'])
    # The field was located in the header, which is a prefix of the message,
    # so replacing the first occurrence edits the header and not the body.
    return raw_msg_bytes.replace(boundary_field, new_field, 1)
|
||||||
|
|
||||||
def __syncmessagesto_copy(self, dstfolder, statusfolder):
|
def __syncmessagesto_copy(self, dstfolder, statusfolder):
|
||||||
"""Pass1: Copy locally existing messages not on the other side.
|
"""Pass1: Copy locally existing messages not on the other side.
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@ from offlineimap import imaputil, imaplibutil, OfflineImapError
|
|||||||
from offlineimap import globals
|
from offlineimap import globals
|
||||||
from imaplib2 import MonthNames
|
from imaplib2 import MonthNames
|
||||||
from .Base import BaseFolder
|
from .Base import BaseFolder
|
||||||
|
from email.errors import NoBoundaryInMultipartDefect
|
||||||
|
|
||||||
# Globals
|
# Globals
|
||||||
CRLF = '\r\n'
|
CRLF = '\r\n'
|
||||||
@ -735,10 +736,8 @@ class IMAPFolder(BaseFolder):
|
|||||||
raise OfflineImapError(
|
raise OfflineImapError(
|
||||||
"Saving msg (%s) in folder '%s', "
|
"Saving msg (%s) in folder '%s', "
|
||||||
"repository '%s' failed (abort). "
|
"repository '%s' failed (abort). "
|
||||||
"Server responded: %s\n"
|
"Server responded: %s\n" %
|
||||||
"Message content was: %s" %
|
(msg_id, self, self.getrepository(), str(e)),
|
||||||
(msg_id, self, self.getrepository(),
|
|
||||||
str(e), dbg_output),
|
|
||||||
OfflineImapError.ERROR.MESSAGE,
|
OfflineImapError.ERROR.MESSAGE,
|
||||||
exc_info()[2])
|
exc_info()[2])
|
||||||
|
|
||||||
@ -752,10 +751,8 @@ class IMAPFolder(BaseFolder):
|
|||||||
imapobj = None
|
imapobj = None
|
||||||
raise OfflineImapError(
|
raise OfflineImapError(
|
||||||
"Saving msg (%s) folder '%s', repo '%s'"
|
"Saving msg (%s) folder '%s', repo '%s'"
|
||||||
"failed (error). Server responded: %s\n"
|
"failed (error). Server responded: %s\n" %
|
||||||
"Message content was: %s" %
|
(msg_id, self, self.getrepository(), str(e)),
|
||||||
(msg_id, self, self.getrepository(),
|
|
||||||
str(e), dbg_output),
|
|
||||||
OfflineImapError.ERROR.MESSAGE,
|
OfflineImapError.ERROR.MESSAGE,
|
||||||
exc_info()[2])
|
exc_info()[2])
|
||||||
|
|
||||||
@ -905,7 +902,35 @@ class IMAPFolder(BaseFolder):
|
|||||||
# Convert email, d[0][1], into a message object (from bytes)
|
# Convert email, d[0][1], into a message object (from bytes)
|
||||||
|
|
||||||
ndata0 = data[0][0].decode('utf-8')
|
ndata0 = data[0][0].decode('utf-8')
|
||||||
ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1])
|
try: ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1])
|
||||||
|
except:
|
||||||
|
err = exc_info()
|
||||||
|
response_type = type(data[0][1]).__name__
|
||||||
|
msg_id = self._extract_message_id(data[0][1])[0].decode('ascii',errors='surrogateescape')
|
||||||
|
raise OfflineImapError(
|
||||||
|
"Exception parsing message with ID ({}) from imaplib (response type: {}).\n {}: {}".format(
|
||||||
|
msg_id, response_type, err[0].__name__, err[1]),
|
||||||
|
OfflineImapError.ERROR.MESSAGE)
|
||||||
|
if len(ndata1.defects) > 0:
|
||||||
|
# We don't automatically apply fixes as to attempt to preserve the original message
|
||||||
|
self.ui.warn("UID {} has defects: {}".format(uids, ndata1.defects))
|
||||||
|
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in ndata1.defects):
|
||||||
|
# (Hopefully) Rare defect from a broken client where multipart boundary is
|
||||||
|
# not properly quoted. Attempt to solve by fixing the boundary and parsing
|
||||||
|
self.ui.warn(" ... applying multipart boundary fix.")
|
||||||
|
ndata1 = self.parser['8bit-RFC'].parsebytes(self._quote_boundary_fix(data[0][1]))
|
||||||
|
try:
|
||||||
|
# See if the defects after fixes are preventing us from obtaining bytes
|
||||||
|
_ = ndata1.as_bytes(policy=self.policy['8bit-RFC'])
|
||||||
|
except UnicodeEncodeError as err:
|
||||||
|
# Unknown issue which is causing failure of as_bytes()
|
||||||
|
msg_id = self.getmessageheader(ndata1, "message-id")
|
||||||
|
if msg_id is None:
|
||||||
|
msg_id = '<Unknown Message-ID>'
|
||||||
|
raise OfflineImapError(
|
||||||
|
"UID {} ({}) has defects preventing it from being processed!\n {}: {}".format(
|
||||||
|
uids, msg_id, type(err).__name__, err),
|
||||||
|
OfflineImapError.ERROR.MESSAGE)
|
||||||
ndata = [ndata0, ndata1]
|
ndata = [ndata0, ndata1]
|
||||||
|
|
||||||
return ndata
|
return ndata
|
||||||
|
@ -25,6 +25,7 @@ from threading import Lock
|
|||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
from offlineimap import OfflineImapError
|
from offlineimap import OfflineImapError
|
||||||
from .Base import BaseFolder
|
from .Base import BaseFolder
|
||||||
|
from email.errors import NoBoundaryInMultipartDefect
|
||||||
|
|
||||||
# Find the UID in a message filename
|
# Find the UID in a message filename
|
||||||
re_uidmatch = re.compile(',U=(\d+)')
|
re_uidmatch = re.compile(',U=(\d+)')
|
||||||
@ -259,8 +260,36 @@ class MaildirFolder(BaseFolder):
|
|||||||
filename = self.messagelist[uid]['filename']
|
filename = self.messagelist[uid]['filename']
|
||||||
filepath = os.path.join(self.getfullname(), filename)
|
filepath = os.path.join(self.getfullname(), filename)
|
||||||
fd = open(filepath, 'rb')
|
fd = open(filepath, 'rb')
|
||||||
retval = self.parser['8bit'].parse(fd)
|
_fd_bytes = fd.read()
|
||||||
fd.close()
|
fd.close()
|
||||||
|
try: retval = self.parser['8bit'].parsebytes(_fd_bytes)
|
||||||
|
except:
|
||||||
|
err = exc_info()
|
||||||
|
msg_id = self._extract_message_id(_fd_bytes)[0].decode('ascii',errors='surrogateescape')
|
||||||
|
raise OfflineImapError(
|
||||||
|
"Exception parsing message with ID ({}) from file ({}).\n {}: {}".format(
|
||||||
|
msg_id, filename, err[0].__name__, err[1]),
|
||||||
|
OfflineImapError.ERROR.MESSAGE)
|
||||||
|
if len(retval.defects) > 0:
|
||||||
|
# We don't automatically apply fixes as to attempt to preserve the original message
|
||||||
|
self.ui.warn("UID {} has defects: {}".format(uid, retval.defects))
|
||||||
|
if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in retval.defects):
|
||||||
|
# (Hopefully) Rare defect from a broken client where multipart boundary is
|
||||||
|
# not properly quoted. Attempt to solve by fixing the boundary and parsing
|
||||||
|
self.ui.warn(" ... applying multipart boundary fix.")
|
||||||
|
retval = self.parser['8bit'].parsebytes(self._quote_boundary_fix(_fd_bytes))
|
||||||
|
try:
|
||||||
|
# See if the defects after fixes are preventing us from obtaining bytes
|
||||||
|
_ = retval.as_bytes(policy=self.policy['8bit'])
|
||||||
|
except UnicodeEncodeError as err:
|
||||||
|
# Unknown issue which is causing failure of as_bytes()
|
||||||
|
msg_id = self.getmessageheader(retval, "message-id")
|
||||||
|
if msg_id is None:
|
||||||
|
msg_id = '<unknown-message-id>'
|
||||||
|
raise OfflineImapError(
|
||||||
|
"UID {} ({}) has defects preventing it from being processed!\n {}: {}".format(
|
||||||
|
uid, msg_id, type(err).__name__, err),
|
||||||
|
OfflineImapError.ERROR.MESSAGE)
|
||||||
return retval
|
return retval
|
||||||
|
|
||||||
# Interface from BaseFolder
|
# Interface from BaseFolder
|
||||||
|
Loading…
Reference in New Issue
Block a user