Included charset detection

This patch adds charset detection (using chardet) so that a message file is decoded with its detected encoding when it is read.

This patch is related to issue #43

Signed-off-by: Rodolfo García Peñas (kix) <kix@kix.es>
This commit is contained in:
Rodolfo García Peñas (kix) 2021-02-19 16:39:17 +01:00
parent 76c7a723db
commit 62490ff183
2 changed files with 17 additions and 3 deletions

View File

@@ -20,9 +20,11 @@ import socket
import time import time
import re import re
import os import os
from pathlib import Path
from sys import exc_info from sys import exc_info
from threading import Lock from threading import Lock
from hashlib import md5 from hashlib import md5
import chardet
from offlineimap import OfflineImapError, emailutil from offlineimap import OfflineImapError, emailutil
from .Base import BaseFolder from .Base import BaseFolder
@@ -256,11 +258,18 @@ class MaildirFolder(BaseFolder):
def getmessage(self, uid): def getmessage(self, uid):
"""Return the content of the message.""" """Return the content of the message."""
# TODO: Perhaps, force the encoding using config file
filename = self.messagelist[uid]['filename'] filename = self.messagelist[uid]['filename']
filepath = os.path.join(self.getfullname(), filename) filepath = os.path.join(self.getfullname(), filename)
file = open(filepath, 'rt') # Open the file as binary and read it
retval = file.read() file = Path(filepath)
file.close() blob = file.read_bytes()
# Detect the encoding
detection = chardet.detect(blob)
encoding = detection["encoding"]
# Read the file as text
retval = blob.decode(encoding)
# TODO: WHY are we replacing \r\n with \n here? And why do we # TODO: WHY are we replacing \r\n with \n here? And why do we
# read it as text? # read it as text?
return retval.replace("\r\n", "\n") return retval.replace("\r\n", "\n")

View File

@@ -3,3 +3,8 @@ gssapi[kerberos]
portalocker[cygwin] portalocker[cygwin]
rfc6555 rfc6555
distro distro
imaplib2~=3.5
urllib3~=1.25.9
certifi~=2020.6.20
chardet~=3.0.4