From c1c200a487cc36381d17275f2f439665c3f11cb0 Mon Sep 17 00:00:00 2001 From: Sebastian Spaeth Date: Fri, 10 Jun 2011 17:32:39 +0200 Subject: [PATCH 1/3] folder/Maildir: cache getfullname() value We use getfullname() very often (thousands to millions), yet we dynamically calculate the very same value over and over. Optimize this by caching the value once and be done with it. Signed-off-by: Sebastian Spaeth Signed-off-by: Nicolas Sebrecht --- offlineimap/folder/Maildir.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/offlineimap/folder/Maildir.py b/offlineimap/folder/Maildir.py index e360619..b9f9d05 100644 --- a/offlineimap/folder/Maildir.py +++ b/offlineimap/folder/Maildir.py @@ -63,12 +63,15 @@ class MaildirFolder(BaseFolder): self.accountname = accountname BaseFolder.__init__(self) #self.ui is set in BaseFolder.init() + # Cache the full folder path, as we use getfullname() very often + self._fullname = os.path.join(self.getroot(), self.getname()) def getaccountname(self): return self.accountname def getfullname(self): - return os.path.join(self.getroot(), self.getname()) + """Return the absolute file path to the Maildir folder (sans cur|new)""" + return self._fullname def getuidvalidity(self): """Maildirs have no notion of uidvalidity, so we just return a magic From e023f190b0eed84fefc10e28bfe5e4e0467ff257 Mon Sep 17 00:00:00 2001 From: Sebastian Spaeth Date: Fri, 10 Jun 2011 17:32:40 +0200 Subject: [PATCH 2/3] folder/Maildir: Store only relative filename components MaildirFolder.messagelist[*]['filename'] was storing the absolute file paths for all stored emails. While this is convenient, it wastes much space, as the folder prefix is always the same and it is known to the MaildirFolder. Just 40 chars in a folder with 100k mails waste >4MB of space. Adapt the few locations where we need the full path to construct it dynamically. Signed-off-by: Sebastian Spaeth Signed-off-by: Nicolas Sebrecht --- offlineimap/folder/Maildir.py | 47 +++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/offlineimap/folder/Maildir.py b/offlineimap/folder/Maildir.py index b9f9d05..a798b36 100644 --- a/offlineimap/folder/Maildir.py +++ b/offlineimap/folder/Maildir.py @@ -185,27 +185,32 @@ class MaildirFolder(BaseFolder): return self.messagelist def getmessage(self, uid): + """Return the content of the message""" filename = self.messagelist[uid]['filename'] - file = open(filename, 'rt') + filepath = os.path.join(self.getfullname(), filename) + file = open(filepath, 'rt') retval = file.read() file.close() + #TODO: WHY are we replacing \r\n with \n here? And why do we + # read it as text? return retval.replace("\r\n", "\n") def getmessagetime( self, uid ): filename = self.messagelist[uid]['filename'] - st = os.stat(filename) + filepath = os.path.join(self.getfullname(), filename) + st = os.stat(filepath) return st.st_mtime def savemessage(self, uid, content, flags, rtime): # This function only ever saves to tmp/, # but it calls savemessageflags() to actually save to cur/ or new/. - self.ui.debug('maildir', 'savemessage: called to write with flags %s and content %s' % \ - (repr(flags), repr(content))) + self.ui.debug('maildir', 'savemessage: called to write with flags %s ' + 'and content %s' % (repr(flags), repr(content))) if uid < 0: # We cannot assign a new uid. return uid if uid in self.messagelist: - # We already have it. + # We already have it, just update flags. self.savemessageflags(uid, flags) return uid @@ -231,7 +236,8 @@ class MaildirFolder(BaseFolder): else: break tmpmessagename = messagename.split(',')[0] - self.ui.debug('maildir', 'savemessage: using temporary name %s' % tmpmessagename) + self.ui.debug('maildir', 'savemessage: using temporary name %s' %\ + tmpmessagename) file = open(os.path.join(tmpdir, tmpmessagename), "wt") file.write(content) @@ -239,10 +245,10 @@ class MaildirFolder(BaseFolder): file.flush() if self.dofsync: os.fsync(file.fileno()) - file.close() + if rtime != None: - os.utime(os.path.join(tmpdir,tmpmessagename), (rtime,rtime)) + os.utime(os.path.join(tmpdir, tmpmessagename), (rtime, rtime)) self.ui.debug('maildir', 'savemessage: moving from %s to %s' % \ (tmpmessagename, messagename)) if tmpmessagename != messagename: # then rename it @@ -259,7 +265,8 @@ class MaildirFolder(BaseFolder): pass self.messagelist[uid] = {'uid': uid, 'flags': [], - 'filename': os.path.join(tmpdir, messagename)} + 'filename': os.path.join('tmp', messagename)} + # savemessageflags moves msg to 'cur' or 'new' as appropriate self.savemessageflags(uid, flags) self.ui.debug('maildir', 'savemessage: returning uid %d' % uid) return uid @@ -269,14 +276,14 @@ class MaildirFolder(BaseFolder): def savemessageflags(self, uid, flags): oldfilename = self.messagelist[uid]['filename'] - newpath, newname = os.path.split(oldfilename) + dir_prefix, newname = os.path.split(oldfilename) tmpdir = os.path.join(self.getfullname(), 'tmp') if 'S' in flags: # If a message has been seen, it goes into the cur - # directory. CR debian#152482, [complete.org #4] - newpath = os.path.join(self.getfullname(), 'cur') + # directory. CR debian#152482 + dir_prefix = 'cur' else: - newpath = os.path.join(self.getfullname(), 'new') + dir_prefix = 'new' infostr = ':' infomatch = re.search('(:.*)$', newname) if infomatch: # If the info string is present.. @@ -287,15 +294,16 @@ class MaildirFolder(BaseFolder): infostr += '2,' + ''.join(flags) newname += infostr - newfilename = os.path.join(newpath, newname) + newfilename = os.path.join(dir_prefix, newname) if (newfilename != oldfilename): - os.rename(oldfilename, newfilename) + os.rename(os.path.join(self.getfullname(), oldfilename), + os.path.join(self.getfullname(), newfilename)) self.messagelist[uid]['flags'] = flags self.messagelist[uid]['filename'] = newfilename # By now, the message had better not be in tmp/ land! final_dir, final_name = os.path.split(self.messagelist[uid]['filename']) - assert final_dir != tmpdir + assert final_dir != 'tmp' def deletemessage(self, uid): """Unlinks a message file from the Maildir. @@ -309,13 +317,16 @@ class MaildirFolder(BaseFolder): return filename = self.messagelist[uid]['filename'] + filepath = os.path.join(self.getfullname(), filename) try: - os.unlink(filename) + os.unlink(filepath) except OSError: # Can't find the file -- maybe already deleted? newmsglist = self._scanfolder() if uid in newmsglist: # Nope, try new filename. - os.unlink(newmsglist[uid]['filename']) + filename = newmsglist[uid]['filename'] + filepath = os.path.join(self.getfullname(), filename) + os.unlink(filepath) # Yep -- return. del(self.messagelist[uid]) From d8026c5308d25aeafd8ef6c6804e1ea757da1ff2 Mon Sep 17 00:00:00 2001 From: Sebastian Spaeth Date: Mon, 13 Jun 2011 15:22:37 +0200 Subject: [PATCH 3/3] Simplify Maildir message saving Previously we were attempting to save out mails according to http://www.qmail.org/man/man5/maildir.html in 4 steps: 1 Create a unique filename 2 Do stat(tmp/). If it found a file, wait 2 sec and go back to 1. 3 Create and write the message to the tmp/. 4 Link from tmp/* to new/* (we did step 2 up to 15 times) But as stated by http://wiki1.dovecot.org/MailboxFormat/Maildir (see section 'Issues with the specification'), this is a pointless approach, e.g. there are race issues between stating that the filename does not exist and the actual moving (when it might exist). So, we can simplify the steps as suggested in the dovecot wiki and tighten up our safety at the same time. One improvement that we do is to open the file, guaranteeing that it did not exist before in an atomic manner, thus our simplified approach is really more secure than what we had before. Also, we throw an OfflineImapError at MESSAGE level when the supposedly unique filename already exists, so that we can skip this message and still continue with other messages. Signed-off-by: Sebastian Spaeth Signed-off-by: Nicolas Sebrecht --- Changelog.draft.rst | 3 ++ offlineimap/folder/Maildir.py | 63 ++++++++++++++--------------------- 2 files changed, 28 insertions(+), 38 deletions(-) diff --git a/Changelog.draft.rst b/Changelog.draft.rst index fd445ac..2035aa8 100644 --- a/Changelog.draft.rst +++ b/Changelog.draft.rst @@ -16,9 +16,12 @@ New Features Changes ------- +* Maildirs use less memory while syncing. + Bug Fixes --------- +* Saving to Maildirs now checks for file existence without race conditions. Pending for the next major release ================================== diff --git a/offlineimap/folder/Maildir.py b/offlineimap/folder/Maildir.py index a798b36..b576507 100644 --- a/offlineimap/folder/Maildir.py +++ b/offlineimap/folder/Maildir.py @@ -28,6 +28,8 @@ try: except ImportError: from md5 import md5 +from offlineimap import OfflineImapError + uidmatchre = re.compile(',U=(\d+)') flagmatchre = re.compile(':.*2,([A-Z]+)') timestampmatchre = re.compile('(\d+)'); @@ -217,52 +219,37 @@ class MaildirFolder(BaseFolder): # Otherwise, save the message in tmp/ and then call savemessageflags() # to give it a permanent home. tmpdir = os.path.join(self.getfullname(), 'tmp') - messagename = None - attempts = 0 - while 1: - if attempts > 15: - raise IOError, "Couldn't write to file %s" % messagename - timeval, timeseq = gettimeseq() - messagename = '%d_%d.%d.%s,U=%d,FMD5=%s' % \ - (timeval, - timeseq, - os.getpid(), - socket.gethostname(), - uid, - md5(self.getvisiblename()).hexdigest()) - if os.path.exists(os.path.join(tmpdir, messagename)): - time.sleep(2) - attempts += 1 + timeval, timeseq = gettimeseq() + messagename = '%d_%d.%d.%s,U=%d,FMD5=%s' % \ + (timeval, + timeseq, + os.getpid(), + socket.gethostname(), + uid, + md5(self.getvisiblename()).hexdigest()) + # open file and write it out + try: + fd = os.open(os.path.join(tmpdir, messagename), + os.O_EXCL|os.O_CREAT|os.O_WRONLY) + except OSError, e: + if e.errno == 17: + #FILE EXISTS ALREADY + severity = OfflineImapError.ERROR.MESSAGE + raise OfflineImapError("Unique filename %s already existing." %\ + messagename, severity) else: - break - tmpmessagename = messagename.split(',')[0] - self.ui.debug('maildir', 'savemessage: using temporary name %s' %\ - tmpmessagename) - file = open(os.path.join(tmpdir, tmpmessagename), "wt") - file.write(content) + raise + file = os.fdopen(fd, 'wt') + file.write(content) # Make sure the data hits the disk file.flush() if self.dofsync: - os.fsync(file.fileno()) + os.fsync(fd) file.close() if rtime != None: - os.utime(os.path.join(tmpdir, tmpmessagename), (rtime, rtime)) - self.ui.debug('maildir', 'savemessage: moving from %s to %s' % \ - (tmpmessagename, messagename)) - if tmpmessagename != messagename: # then rename it - os.rename(os.path.join(tmpdir, tmpmessagename), - os.path.join(tmpdir, messagename)) - - if self.dofsync: - try: - # fsync the directory (safer semantics in Linux) - fd = os.open(tmpdir, os.O_RDONLY) - os.fsync(fd) - os.close(fd) - except: - pass + os.utime(os.path.join(tmpdir, messagename), (rtime, rtime)) self.messagelist[uid] = {'uid': uid, 'flags': [], 'filename': os.path.join('tmp', messagename)}