Merge pull request #72 from jishac/encoding_edge_cases
Encoding: edge cases and error handling
This commit is contained in:
		| @@ -841,6 +841,76 @@ class BaseFolder: | |||||||
|                               (uid, self.accountname)) |                               (uid, self.accountname)) | ||||||
|             raise  # Raise on unknown errors, so we can fix those. |             raise  # Raise on unknown errors, so we can fix those. | ||||||
|  |  | ||||||
|  |     def _extract_message_id(self, raw_msg_bytes): | ||||||
|  |         """Extract the Message-ID from a bytes object containing a raw message. | ||||||
|  |  | ||||||
|  |         This function attempts to find the Message-ID for a message that has not | ||||||
|  |         been processed by the built-in email library, and is therefore NOT an | ||||||
|  |         email object.  If parsing the message fails (or is otherwise not | ||||||
|  |         needed), this utility can be useful to help provide a (hopefully) unique | ||||||
|  |         identifier in log messages to facilitate locating the message on disk. | ||||||
|  |  | ||||||
|  |         :param raw_msg_bytes: bytes object containing the raw email message. | ||||||
|  |         :returns: A tuple containing the contents of the Message-ID header if | ||||||
|  |         found (or <Unknown Message-ID> if not found) and a flag which is True if | ||||||
|  |         the Message-ID was in proper RFC format or False if it contained | ||||||
|  |         defects. | ||||||
|  |         """ | ||||||
|  |         msg_header = re.split(b'[\r]?\n[\r]?\n', raw_msg_bytes)[0] | ||||||
|  |         try: | ||||||
|  |             msg_id = re.search(b"\nmessage-id:[\s]+(<[A-Za-z0-9!#$%&'*+-/=?^_`{}|~.@ ]+>)",  | ||||||
|  |                 msg_header, re.IGNORECASE).group(1) | ||||||
|  |         except AttributeError: | ||||||
|  |             # No match - Likely not following RFC rules.  Try and find anything | ||||||
|  |             # that looks like it could be the Message-ID but flag it. | ||||||
|  |             _start_pos = msg_header.find(b'\nMessage-ID:') | ||||||
|  |             if _start_pos > 0: | ||||||
|  |                 _end_pos = msg_header.find(b'\n',_start_pos+15) | ||||||
|  |                 msg_id = msg_header[_start_pos+12:_end_pos].strip() | ||||||
|  |                 return (msg_id, False) | ||||||
|  |             else: | ||||||
|  |                 return (b"<Unknown Message-ID>", False) | ||||||
|  |         return (msg_id, True) | ||||||
|  |  | ||||||
|  |     def _quote_boundary_fix(self, raw_msg_bytes): | ||||||
|  |         """Modify a raw message to quote the boundary separator for multipart messages. | ||||||
|  |  | ||||||
|  |         This function quotes only the first occurrence of the boundary field in | ||||||
|  |         the email header, and quotes any boundary value.  Improperly quoted | ||||||
|  |         boundary fields can give the internal python email library issues. | ||||||
|  |  | ||||||
|  |         :param raw_msg_bytes: bytes object containing the raw email message. | ||||||
|  |         :returns: The raw byte stream containing the quoted boundary | ||||||
|  |         """ | ||||||
|  |         # Use re.split to extract just the header, and search for the boundary in | ||||||
|  |         # the context-type header and extract just the boundary and characters per | ||||||
|  |         # RFC 2046 ( see https://tools.ietf.org/html/rfc2046#section-5.1.1 ) | ||||||
|  |         # We don't cap the length to 70 characters, because we are just trying to | ||||||
|  |         # soft fix this message to resolve the python library looking for properly | ||||||
|  |         # quoted boundaries. | ||||||
|  |         try: boundary_field = \ | ||||||
|  |             re.search(b"content-type:.*(boundary=[\"]?[A-Za-z0-9'()+_,-./:=? ]+[\"]?)", | ||||||
|  |               re.split(b'[\r]?\n[\r]?\n', raw_msg_bytes)[0], | ||||||
|  |               (re.IGNORECASE|re.DOTALL)).group(1) | ||||||
|  |         except AttributeError: | ||||||
|  |             # No match | ||||||
|  |             return raw_msg_bytes | ||||||
|  |         # get the boundary field, and strip off any trailing ws (against RFC rules, leading ws is OK) | ||||||
|  |         # if it was already quoted, well then there was nothing to fix | ||||||
|  |         boundary, value = boundary_field.split(b'=', 1) | ||||||
|  |         value = value.rstrip() | ||||||
|  |         # ord(b'"') == 34 | ||||||
|  |         if value[0] == value[-1] == 34: | ||||||
|  |             # Sanity Check - Do not requote if already quoted. | ||||||
|  |             # A quoted boundary was the end goal so return the original | ||||||
|  |             # | ||||||
|  |             # No need to worry about if the original email did something like: | ||||||
|  |             # boundary="ahahah  " as the email library will trim the ws for us | ||||||
|  |             return raw_msg_bytes | ||||||
|  |         else: | ||||||
|  |             new_field = b''.join([boundary, b'="', value, b'"']) | ||||||
|  |             return(raw_msg_bytes.replace(boundary_field, new_field, 1)) | ||||||
|  |  | ||||||
|     def __syncmessagesto_copy(self, dstfolder, statusfolder): |     def __syncmessagesto_copy(self, dstfolder, statusfolder): | ||||||
|         """Pass1: Copy locally existing messages not on the other side. |         """Pass1: Copy locally existing messages not on the other side. | ||||||
|  |  | ||||||
|   | |||||||
| @@ -24,6 +24,7 @@ from offlineimap import imaputil, imaplibutil, OfflineImapError | |||||||
| from offlineimap import globals | from offlineimap import globals | ||||||
| from imaplib2 import MonthNames | from imaplib2 import MonthNames | ||||||
| from .Base import BaseFolder | from .Base import BaseFolder | ||||||
|  | from email.errors import NoBoundaryInMultipartDefect | ||||||
|  |  | ||||||
| # Globals | # Globals | ||||||
| CRLF = '\r\n' | CRLF = '\r\n' | ||||||
| @@ -735,10 +736,8 @@ class IMAPFolder(BaseFolder): | |||||||
|                         raise OfflineImapError( |                         raise OfflineImapError( | ||||||
|                             "Saving msg (%s) in folder '%s', " |                             "Saving msg (%s) in folder '%s', " | ||||||
|                             "repository '%s' failed (abort). " |                             "repository '%s' failed (abort). " | ||||||
|                             "Server responded: %s\n" |                             "Server responded: %s\n" % | ||||||
|                             "Message content was: %s" % |                             (msg_id, self, self.getrepository(), str(e)), | ||||||
|                             (msg_id, self, self.getrepository(), |  | ||||||
|                              str(e), dbg_output), |  | ||||||
|                             OfflineImapError.ERROR.MESSAGE, |                             OfflineImapError.ERROR.MESSAGE, | ||||||
|                             exc_info()[2]) |                             exc_info()[2]) | ||||||
|  |  | ||||||
| @@ -752,10 +751,8 @@ class IMAPFolder(BaseFolder): | |||||||
|                     imapobj = None |                     imapobj = None | ||||||
|                     raise OfflineImapError( |                     raise OfflineImapError( | ||||||
|                         "Saving msg (%s) folder '%s', repo '%s'" |                         "Saving msg (%s) folder '%s', repo '%s'" | ||||||
|                         "failed (error). Server responded: %s\n" |                         "failed (error). Server responded: %s\n" % | ||||||
|                         "Message content was: %s" % |                         (msg_id, self, self.getrepository(), str(e)), | ||||||
|                         (msg_id, self, self.getrepository(), |  | ||||||
|                          str(e), dbg_output), |  | ||||||
|                         OfflineImapError.ERROR.MESSAGE, |                         OfflineImapError.ERROR.MESSAGE, | ||||||
|                         exc_info()[2]) |                         exc_info()[2]) | ||||||
|  |  | ||||||
| @@ -905,7 +902,35 @@ class IMAPFolder(BaseFolder): | |||||||
|         # Convert email, d[0][1], into a message object (from bytes)  |         # Convert email, d[0][1], into a message object (from bytes)  | ||||||
|  |  | ||||||
|         ndata0 = data[0][0].decode('utf-8') |         ndata0 = data[0][0].decode('utf-8') | ||||||
|         ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1]) |         try: ndata1 = self.parser['8bit-RFC'].parsebytes(data[0][1]) | ||||||
|  |         except: | ||||||
|  |             err = exc_info() | ||||||
|  |             response_type = type(data[0][1]).__name__ | ||||||
|  |             msg_id = self._extract_message_id(data[0][1])[0].decode('ascii',errors='surrogateescape') | ||||||
|  |             raise OfflineImapError( | ||||||
|  |                 "Exception parsing message with ID ({}) from imaplib (response type: {}).\n {}: {}".format( | ||||||
|  |                     msg_id, response_type, err[0].__name__, err[1]), | ||||||
|  |                 OfflineImapError.ERROR.MESSAGE) | ||||||
|  |         if len(ndata1.defects) > 0: | ||||||
|  |             # We don't automatically apply fixes as to attempt to preserve the original message | ||||||
|  |             self.ui.warn("UID {} has defects: {}".format(uids, ndata1.defects)) | ||||||
|  |             if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in ndata1.defects): | ||||||
|  |                 # (Hopefully) Rare defect from a broken client where multipart boundary is | ||||||
|  |                 # not properly quoted.  Attempt to solve by fixing the boundary and parsing | ||||||
|  |                 self.ui.warn(" ... applying multipart boundary fix.") | ||||||
|  |                 ndata1 = self.parser['8bit-RFC'].parsebytes(self._quote_boundary_fix(data[0][1])) | ||||||
|  |             try: | ||||||
|  |                 # See if the defects after fixes are preventing us from obtaining bytes | ||||||
|  |                 _ = ndata1.as_bytes(policy=self.policy['8bit-RFC']) | ||||||
|  |             except UnicodeEncodeError as err: | ||||||
|  |                 # Unknown issue which is causing failure of as_bytes() | ||||||
|  |                 msg_id = self.getmessageheader(ndata1, "message-id") | ||||||
|  |                 if msg_id is None: | ||||||
|  |                     msg_id = '<Unknown Message-ID>' | ||||||
|  |                 raise OfflineImapError( | ||||||
|  |                         "UID {} ({}) has defects preventing it from being processed!\n  {}: {}".format( | ||||||
|  |                             uids, msg_id, type(err).__name__, err), | ||||||
|  |                         OfflineImapError.ERROR.MESSAGE) | ||||||
|         ndata = [ndata0, ndata1] |         ndata = [ndata0, ndata1] | ||||||
|  |  | ||||||
|         return ndata |         return ndata | ||||||
|   | |||||||
| @@ -25,6 +25,7 @@ from threading import Lock | |||||||
| from hashlib import md5 | from hashlib import md5 | ||||||
| from offlineimap import OfflineImapError | from offlineimap import OfflineImapError | ||||||
| from .Base import BaseFolder | from .Base import BaseFolder | ||||||
|  | from email.errors import NoBoundaryInMultipartDefect | ||||||
|  |  | ||||||
| # Find the UID in a message filename | # Find the UID in a message filename | ||||||
| re_uidmatch = re.compile(',U=(\d+)') | re_uidmatch = re.compile(',U=(\d+)') | ||||||
| @@ -259,8 +260,36 @@ class MaildirFolder(BaseFolder): | |||||||
|         filename = self.messagelist[uid]['filename'] |         filename = self.messagelist[uid]['filename'] | ||||||
|         filepath = os.path.join(self.getfullname(), filename) |         filepath = os.path.join(self.getfullname(), filename) | ||||||
|         fd = open(filepath, 'rb') |         fd = open(filepath, 'rb') | ||||||
|         retval = self.parser['8bit'].parse(fd) |         _fd_bytes = fd.read() | ||||||
|         fd.close() |         fd.close() | ||||||
|  |         try: retval = self.parser['8bit'].parsebytes(_fd_bytes) | ||||||
|  |         except: | ||||||
|  |             err = exc_info() | ||||||
|  |             msg_id = self._extract_message_id(_fd_bytes)[0].decode('ascii',errors='surrogateescape') | ||||||
|  |             raise OfflineImapError( | ||||||
|  |                 "Exception parsing message with ID ({}) from file ({}).\n {}: {}".format( | ||||||
|  |                     msg_id, filename, err[0].__name__, err[1]), | ||||||
|  |                 OfflineImapError.ERROR.MESSAGE) | ||||||
|  |         if len(retval.defects) > 0: | ||||||
|  |             # We don't automatically apply fixes as to attempt to preserve the original message | ||||||
|  |             self.ui.warn("UID {} has defects: {}".format(uid, retval.defects)) | ||||||
|  |             if any(isinstance(defect, NoBoundaryInMultipartDefect) for defect in retval.defects): | ||||||
|  |                 # (Hopefully) Rare defect from a broken client where multipart boundary is | ||||||
|  |                 # not properly quoted.  Attempt to solve by fixing the boundary and parsing | ||||||
|  |                 self.ui.warn(" ... applying multipart boundary fix.") | ||||||
|  |                 retval = self.parser['8bit'].parsebytes(self._quote_boundary_fix(_fd_bytes)) | ||||||
|  |             try: | ||||||
|  |                 # See if the defects after fixes are preventing us from obtaining bytes | ||||||
|  |                 _ = retval.as_bytes(policy=self.policy['8bit']) | ||||||
|  |             except UnicodeEncodeError as err: | ||||||
|  |                 # Unknown issue which is causing failure of as_bytes() | ||||||
|  |                 msg_id = self.getmessageheader(retval, "message-id") | ||||||
|  |                 if msg_id is None: | ||||||
|  |                     msg_id = '<unknown-message-id>' | ||||||
|  |                 raise OfflineImapError( | ||||||
|  |                         "UID {} ({}) has defects preventing it from being processed!\n  {}: {}".format( | ||||||
|  |                             uid, msg_id, type(err).__name__, err), | ||||||
|  |                         OfflineImapError.ERROR.MESSAGE) | ||||||
|         return retval |         return retval | ||||||
|  |  | ||||||
|     # Interface from BaseFolder |     # Interface from BaseFolder | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Rodolfo García Peñas (kix)
					Rodolfo García Peñas (kix)