From 1d2478bcb670b4cfcfc4923a9936f6764fbc0e27 Mon Sep 17 00:00:00 2001 From: Joseph Ishac Date: Tue, 9 Feb 2021 14:58:30 -0500 Subject: [PATCH] Series of *UNTESTED* changes that should move the internal structure of a message from a string to an email object that is part of the built-in email library. This allows for emails to be processed as bytes and re-encoded properly if they are not UTF-8 or ascii encoded. Currently these changes cover the Base, IMAP, and Maildir classes but not the specialized GMAIL class yet. --- offlineimap/folder/Base.py | 222 ++++++++++------------------------ offlineimap/folder/IMAP.py | 51 ++++---- offlineimap/folder/Maildir.py | 38 +++--- offlineimap/ui/UIBase.py | 2 +- 4 files changed, 111 insertions(+), 202 deletions(-) diff --git a/offlineimap/folder/Base.py b/offlineimap/folder/Base.py index 8b98555..929f515 100644 --- a/offlineimap/folder/Base.py +++ b/offlineimap/folder/Base.py @@ -22,6 +22,11 @@ import re import time from sys import exc_info +from email import policy +from email.parser import BytesParser +from email.generator import BytesGenerator +from email.utils import parsedate_tz, mktime_tz + from offlineimap import threadutil from offlineimap.ui import getglobalui from offlineimap.error import OfflineImapError @@ -42,6 +47,22 @@ class BaseFolder: self.ui = getglobalui() self.messagelist = {} + # Use the built-in email libraries + # Establish some policies + self.policy = { + '7bit': + policy.default.clone(cte_type='7bit',utf8=False,refold_source='none'), + '7bit-RFC': + policy.default.clone(cte_type='7bit',utf8=False,refold_source='none',linesep='\r\n'), + '8bit': + policy.default.clone(cte_type='8bit',utf8=True,refold_source='none'), + '8bit-RFC': + policy.default.clone(cte_type='8bit',utf8=True,refold_source='none',linesep='\r\n'), + } + # Parsers, keyed by the name of the policy each one is built with + self.parse = { + '8bit': BytesParser(policy=self.policy['8bit']), + } # Save original name for folderfilter operations. self.ffilter_name = name # Top level dir name is always ''. 
@@ -466,7 +487,7 @@ class BaseFolder: except: raise IOError("Can't read %s" % uidfile) - def savemessage(self, uid, content, flags, rtime): + def savemessage(self, uid, msg, flags, rtime): """Writes a new message, with the specified uid. If the uid is < 0: The backend should assign a new uid and @@ -637,211 +658,90 @@ class BaseFolder: for uid in uidlist: self.deletemessagelabels(uid, labels) - def addmessageheader(self, content, linebreak, headername, headervalue): + def addmessageheader(self, msg, headername, headervalue): """Adds new header to the provided message. - WARNING: This function is a bit tricky, and modifying it in the wrong - way, may easily lead to data-loss. - Arguments: - - content: message content, headers and body as a single string - - linebreak: string that carries line ending + - msg: message itself - headername: name of the header to add - headervalue: value of the header to add - .. note:: + Returns: None - The following documentation will not get displayed correctly after - being processed by Sphinx. View the source of this method to read it. - - This has to deal with strange corner cases where the header is - missing or empty. Here are illustrations for all the cases, - showing where the header gets inserted and what the end result - is. In each illustration, '+' means the added contents. Note - that these examples assume LF for linebreak, not CRLF, so '\n' - denotes a linebreak and '\n\n' corresponds to the transition - between header and body. However if the linebreak parameter - is set to '\r\n' then you would have to substitute '\r\n' for - '\n' in the below examples. 
- - * Case 1: No '\n\n', leading '\n' - - +X-Flying-Pig-Header: i am here\n - \n - This is the body\n - next line\n - - * Case 2: '\n\n' at position 0 - - +X-Flying-Pig-Header: i am here - \n - \n - This is the body\n - next line\n - - * Case 3: No '\n\n', no leading '\n' - - +X-Flying-Pig-Header: i am here\n - +\n - This is the body\n - next line\n - - * Case 4: '\n\n' at non-zero position - - Subject: Something wrong with OI\n - From: some@person.at - +\nX-Flying-Pig-Header: i am here - \n - \n - This is the body\n - next line\n """ self.ui.debug('', 'addmessageheader: called to add %s: %s' % (headername, headervalue)) - insertionpoint = content.find(linebreak * 2) - if insertionpoint == -1: - self.ui.debug('', 'addmessageheader: headers were missing') - else: - self.ui.debug('', - 'addmessageheader: headers end at position %d' % - insertionpoint) - mark = '==>EOH<==' - contextstart = max(0, insertionpoint - 100) - contextend = min(len(content), insertionpoint + 100) - self.ui.debug('', 'addmessageheader: header/body transition " \ - "context (marked by %s): %s%s%s' % ( - mark, repr(content[contextstart:insertionpoint]), - mark, repr(content[insertionpoint:contextend]) - ) - ) + msg.add_header(headername,headervalue) + return - # Hoping for case #4. - prefix = linebreak - suffix = '' - # Case #2. - if insertionpoint == 0: - prefix = '' - suffix = '' - # Either case #1 or #3. - elif insertionpoint == -1: - prefix = '' - suffix = linebreak - insertionpoint = 0 - # Case #3: when body starts immediately, without preceding '\n' - # (this shouldn't happen with proper mail messages, but - # we seen many broken ones), we should add '\n' to make - # new (and the only header, in this case) to be properly - # separated from the message body. 
- if content[0:len(linebreak)] != linebreak: - suffix = suffix + linebreak - - self.ui.debug('', - 'addmessageheader: insertionpoint = %d' % insertionpoint) - headers = content[0:insertionpoint] - self.ui.debug('', - 'addmessageheader: headers = %s' % repr(headers)) - new_header = prefix + ("%s: %s" % (headername, headervalue)) + suffix - self.ui.debug('', - 'addmessageheader: new_header = %s' % repr(new_header)) - return headers + new_header + content[insertionpoint:] - - def __find_eoh(self, content): - """Searches for the point where mail headers end. - - Either double '\n', or end of string. - - Arguments: - - content: contents of the message to search in - Returns: position of the first non-header byte. - """ - - eoh_cr = content.find('\n\n') - if eoh_cr == -1: - eoh_cr = len(content) - - return eoh_cr - - def getmessageheader(self, content, name): - """Return the value of the first occurence of the given header. + def getmessageheader(self, msg, headername): + """Return the value of an undefined occurrence of the given header. Header name is case-insensitive. Arguments: - - contents: message itself - - name: name of the header to be searched + - msg: message itself + - headername: name of the header to be searched Returns: header value or None if no such header was found. """ - self.ui.debug('', 'getmessageheader: called to get %s' % name) + self.ui.debug('', 'getmessageheader: called to get %s' % headername) - eoh = self.__find_eoh(content) - self.ui.debug('', 'getmessageheader: eoh = %d' % eoh) - headers = content[0:eoh] - self.ui.debug('', 'getmessageheader: headers = %s' % repr(headers)) + return msg.get(headername) - m = re.search('^%s:(.*)$' % name, headers, - flags=re.MULTILINE | re.IGNORECASE) - if m: - return m.group(1).strip() - else: - return None - - def getmessageheaderlist(self, content, name): + def getmessageheaderlist(self, msg, headername): """Return a list of values for the given header. + Header name is case-insensitive. 
+ + Arguments: - - contents: message itself - - name: name of the header to be searched + - msg: message itself + - headername: name of the header to be searched Returns: list of header values or empty list if no such header was found. """ - self.ui.debug('', 'getmessageheaderlist: called to get %s' % name) + self.ui.debug('', 'getmessageheaderlist: called to get %s' % headername) - eoh = self.__find_eoh(content) - self.ui.debug('', 'getmessageheaderlist: eoh = %d' % eoh) - headers = content[0:eoh] - self.ui.debug('', 'getmessageheaderlist: headers = %s' % repr(headers)) + return msg.get_all(headername,[]) - return re.findall('^%s:(.*)$' % - name, headers, flags=re.MULTILINE | re.IGNORECASE) - - def deletemessageheaders(self, content, header_list): - """Deletes headers in the given list from the message content. + def deletemessageheaders(self, msg, header_list): + """Deletes headers in the given list from the message. Arguments: - - content: message itself + - msg: message itself - header_list: list of headers to be deleted or just the header name - We expect our message to have '\n' as line endings.""" + """ if type(header_list) != type([]): header_list = [header_list] self.ui.debug('', 'deletemessageheaders: called to delete %s' % header_list) - if not len(header_list): - return content + for h in header_list: + del msg[h] - eoh = self.__find_eoh(content) - self.ui.debug('', 'deletemessageheaders: end of headers = %d' % eoh) - headers = content[0:eoh] - rest = content[eoh:] - self.ui.debug('', 'deletemessageheaders: headers = %s' % repr(headers)) - new_headers = [] - for h in headers.split('\n'): - keep_it = True - for trim_h in header_list: - if len(h) > len(trim_h) \ - and h[0:len(trim_h) + 1] == (trim_h + ":"): - keep_it = False - break - if keep_it: - new_headers.append(h) + return - return '\n'.join(new_headers) + rest + def get_message_date(self, msg, header="Date"): + """Returns the Unix timestamp of the email message, derived from the + Date field header by default. 
+ + Arguments: + - msg: message itself + - header: headers to extract the date from + + Returns: timestamp or `None` in the case of failure. + """ + + datetuple = parsedate_tz(msg.get(header)) + if datetuple is None: + return None + + return mktime_tz(datetuple) def change_message_uid(self, uid, new_uid): """Change the message from existing uid to new_uid. diff --git a/offlineimap/folder/IMAP.py b/offlineimap/folder/IMAP.py index f08361b..658b20b 100644 --- a/offlineimap/folder/IMAP.py +++ b/offlineimap/folder/IMAP.py @@ -375,7 +375,7 @@ class IMAPFolder(BaseFolder): def getmessagekeywords(self, uid): return self.messagelist[uid]['keywords'] - def __generate_randomheader(self, content): + def __generate_randomheader(self, msg, policy=None): """Returns a unique X-OfflineIMAP header Generate an 'X-OfflineIMAP' mail header which contains a random @@ -390,6 +390,10 @@ class IMAPFolder(BaseFolder): """ headername = 'X-OfflineIMAP' + if policy is None: + output_policy = self.policy['8bit-RFC'] + else: + output_policy = policy # We need a random component too. If we ever upload the same # mail twice (e.g. in different folders), we would still need to # get the UID for the correct one. As we won't have too many @@ -398,9 +402,9 @@ class IMAPFolder(BaseFolder): # Compute unsigned crc32 of 'content' as unique hash. # NB: crc32 returns unsigned only starting with python 3.0. 
- headervalue = str(binascii.crc32(str.encode(content)) - & 0xffffffff) + '-' - headervalue += str(self.randomgenerator.randint(0, 9999999999)) + headervalue = '{}-{}'.format( + (binascii.crc32(msg.as_bytes(policy=output_policy)) & 0xffffffff), + self.randomgenerator.randint(0, 9999999999)) return headername, headervalue def __savemessage_searchforheader(self, imapobj, headername, headervalue): @@ -539,7 +543,7 @@ class IMAPFolder(BaseFolder): return 0 - def __getmessageinternaldate(self, content, rtime=None): + def __getmessageinternaldate(self, msg, rtime=None): """Parses mail and returns an INTERNALDATE string It will use information in the following order, falling back as an @@ -571,7 +575,7 @@ class IMAPFolder(BaseFolder): (which is fine as value for append).""" if rtime is None: - rtime = emailutil.get_message_date(content) + rtime = self.get_message_date(msg) if rtime is None: return None datetuple = time.localtime(rtime) @@ -619,7 +623,7 @@ class IMAPFolder(BaseFolder): return internaldate # Interface from BaseFolder - def savemessage(self, uid, content, flags, rtime): + def savemessage(self, uid, msg, flags, rtime): """Save the message on the Server This backend always assigns a new uid, so the uid arg is ignored. @@ -632,7 +636,7 @@ class IMAPFolder(BaseFolder): savemessage is never called in a dryrun mode. :param uid: Message UID - :param content: Message content + :param msg: Message Object :param flags: Message flags :param rtime: A timestamp to be used as the mail date :returns: the UID of the new message as assigned by the server. If the @@ -647,16 +651,19 @@ class IMAPFolder(BaseFolder): self.savemessageflags(uid, flags) return uid - content = self.deletemessageheaders(content, self.filterheaders) + # Filter user requested headers before uploading to the IMAP server + self.deletemessageheaders(msg, self.filterheaders) - # Use proper CRLF all over the message. - content = re.sub("(? 
200: - dbg_output = "%s...%s" % (content[:150], content[-50:]) + dbg_output = "%s...%s" % (msg_s[:150], msg_s[-50:]) else: - dbg_output = content + dbg_output = msg_s self.ui.debug('imap', "savemessage: date: %s, content: '%s'" % (date, dbg_output)) @@ -695,7 +702,7 @@ class IMAPFolder(BaseFolder): except imapobj.readonly: # readonly exception. Return original uid to notify that # we did not save the message. (see savemessage in Base.py) - self.ui.msgtoreadonly(self, uid, content, flags) + self.ui.msgtoreadonly(self, uid) return uid # Do the APPEND. @@ -703,7 +710,7 @@ class IMAPFolder(BaseFolder): (typ, dat) = imapobj.append( self.getfullIMAPname(), imaputil.flagsmaildir2imap(flags), - date, bytes(content, 'utf-8')) + date, msg.as_bytes(policy=output_policy)) # This should only catch 'NO' responses since append() # will raise an exception for 'BAD' responses: if typ != 'OK': @@ -716,12 +723,12 @@ class IMAPFolder(BaseFolder): # In this case, we should immediately abort # the repository sync and continue # with the next account. - msg = \ + err_msg = \ "Saving msg (%s) in folder '%s', " \ "repository '%s' failed (abort). " \ "Server responded: %s %s\n" % \ (msg_id, self, self.getrepository(), typ, dat) - raise OfflineImapError(msg, OfflineImapError.ERROR.REPO) + raise OfflineImapError(err_msg, OfflineImapError.ERROR.REPO) retry_left = 0 # Mark as success. except imapobj.abort as e: # Connection has been reset, release connection and retry. diff --git a/offlineimap/folder/Maildir.py b/offlineimap/folder/Maildir.py index 205feea..42e2f0c 100644 --- a/offlineimap/folder/Maildir.py +++ b/offlineimap/folder/Maildir.py @@ -258,12 +258,10 @@ class MaildirFolder(BaseFolder): filename = self.messagelist[uid]['filename'] filepath = os.path.join(self.getfullname(), filename) - file = open(filepath, 'rt') - retval = file.read() - file.close() - # TODO: WHY are we replacing \r\n with \n here? And why do we - # read it as text? 
- return retval.replace("\r\n", "\n") + fd = open(filepath, 'rb') + retval = self.parse['8bit'].parse(fd) + fd.close() + return retval # Interface from BaseFolder def getmessagetime(self, uid): @@ -288,17 +286,21 @@ class MaildirFolder(BaseFolder): uid, self._foldermd5, self.infosep, ''.join(sorted(flags))) return uniq_name.replace(os.path.sep, self.sep_subst) - def save_to_tmp_file(self, filename, content): + def save_to_tmp_file(self, filename, msg, policy=None): """Saves given content to the named temporary file in the 'tmp' subdirectory of $CWD. Arguments: - filename: name of the temporary file; - - content: data to be saved. + - msg: Email message object Returns: relative path to the temporary file that was created.""" + if policy is None: + output_policy = self.policy['8bit'] + else: + output_policy = policy tmpname = os.path.join('tmp', filename) # Open file and write it out. # XXX: why do we need to loop 7 times? @@ -324,8 +326,8 @@ class MaildirFolder(BaseFolder): else: raise - fd = os.fdopen(fd, 'wt') - fd.write(content) + fd = os.fdopen(fd, 'wb') + fd.write(msg.as_bytes(policy=output_policy)) # Make sure the data hits the disk. fd.flush() if self.dofsync(): @@ -335,7 +337,7 @@ class MaildirFolder(BaseFolder): return tmpname # Interface from BaseFolder - def savemessage(self, uid, content, flags, rtime): + def savemessage(self, uid, msg, flags, rtime): """Writes a new message, with the specified uid. See folder/Base for detail. 
Note that savemessage() does not @@ -359,15 +361,15 @@ class MaildirFolder(BaseFolder): message_timestamp = None if self._filename_use_mail_timestamp is not False: try: - message_timestamp = emailutil.get_message_date(content, 'Date') + message_timestamp = self.get_message_date(msg, 'Date') if message_timestamp is None: # Give a try with Delivery-date - message_timestamp = emailutil.get_message_date( - content, 'Delivery-date') + message_timestamp = self.get_message_date( + msg, 'Delivery-date') except Exception as e: # This should never happen. from offlineimap.ui import getglobalui - datestr = emailutil.get_message_date(content) + datestr = self.get_message_date(msg) ui = getglobalui() ui.warn("UID %d has invalid date %s: %s\n" "Not using message timestamp as file prefix" % @@ -375,11 +377,11 @@ class MaildirFolder(BaseFolder): # No need to check if message_timestamp is None here since it # would be overridden by _gettimeseq. messagename = self.new_message_filename(uid, flags, date=message_timestamp) - tmpname = self.save_to_tmp_file(messagename, content) + tmpname = self.save_to_tmp_file(messagename, msg) if self._utime_from_header is True: try: - date = emailutil.get_message_date(content, 'Date') + date = self.get_message_date(msg, 'Date') if date is not None: os.utime(os.path.join(self.getfullname(), tmpname), (date, date)) @@ -387,7 +389,7 @@ class MaildirFolder(BaseFolder): # int32. 
except Exception as e: from offlineimap.ui import getglobalui - datestr = emailutil.get_message_date(content) + datestr = self.get_message_date(msg) ui = getglobalui() ui.warn("UID %d has invalid date %s: %s\n" "Not changing file modification time" % (uid, datestr, e)) diff --git a/offlineimap/ui/UIBase.py b/offlineimap/ui/UIBase.py index 0307b23..1f0a8dd 100644 --- a/offlineimap/ui/UIBase.py +++ b/offlineimap/ui/UIBase.py @@ -266,7 +266,7 @@ class UIBase: (self.getnicename(x), x.getname()) for x in folder_list]) # WARNINGS - def msgtoreadonly(self, destfolder, uid, content, flags): + def msgtoreadonly(self, destfolder, uid): if self.config.has_option('general', 'ignore-readonly') and \ self.config.getboolean('general', 'ignore-readonly'): return