import copy
import email.utils
import json
import logging
import re

from imapclient.imapclient import decode_utf7

import olefile
from email.parser import Parser as EmailParser
from extract_msg import constants
from extract_msg.attachment import Attachment
from extract_msg.compat import os_ as os
from extract_msg.properties import Properties
from extract_msg.recipient import Recipient
from extract_msg.utils import addNumToDir, encode, has_len, stri, windowsUnicode, xstr
from extract_msg.exceptions import InvalidFileFormat

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class Message(olefile.OleFileIO):
    """
    Parser for Microsoft Outlook message files.

    Most accessors are lazy properties: the value is computed on first access,
    cached on the instance, and returned from the cache afterwards.
    """

    def __init__(self, path, prefix='', attachmentClass=Attachment, filename=None):
        """
        :param path: path to the msg file in the system or is the raw msg file.
        :param prefix: used for extracting embedded msg files
            inside the main one. Do not set manually unless
            you know what you are doing.
        :param attachmentClass: optional, the class the Message object
            will use for attachments. You probably should
            not change this value unless you know what you
            are doing.
        :param filename: optional, the filename to be used by default when saving.
        :raises InvalidFileFormat: if olefile reports that the input is not an
            OLE2 structured storage file.
        :raises TypeError: if :param prefix: is neither a string nor something
            that can be joined with '/'.
        """
        # WARNING DO NOT MANUALLY MODIFY PREFIX. Let the program set it.
        self.__path = path
        self.__attachmentClass = attachmentClass

        try:
            olefile.OleFileIO.__init__(self, path)
        except IOError as e:    # py2 and py3 compatible
            logger.error(e)
            # Exceptions have no `.message` attribute on Python 3, so compare
            # the string form, which works on both major versions.
            if str(e) == 'not an OLE2 structured storage file':
                raise InvalidFileFormat(e)
            else:
                raise

        prefixl = []
        tmp_condition = prefix != ''
        if tmp_condition:
            if not isinstance(prefix, stri):
                try:
                    prefix = '/'.join(prefix)
                except Exception:
                    raise TypeError('Invalid prefix type: ' + str(type(prefix)) +
                                    '\n(This was probably caused by you setting it manually).')
            # Normalize to forward slashes and keep both a list form and a
            # string form that always ends with '/'.
            prefix = prefix.replace('\\', '/')
            g = prefix.split("/")
            if g[-1] == '':
                g.pop()
            prefixl = g
            if prefix[-1] != '/':
                prefix += '/'
        self.__prefix = prefix
        self.__prefixList = prefixl

        if tmp_condition:
            # With a prefix set, the filename is read from a stream that lives
            # one level above the prefix; this overrides the `filename` argument.
            filename = self._getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix=False)
        if filename is not None:
            self.filename = filename
        elif has_len(path):
            # Only inputs shorter than 1536 items are used as a filename.
            if len(path) < 1536:
                self.filename = path
            else:
                self.filename = None
        else:
            self.filename = None

        # Initialize properties in the order that is least likely to cause bugs.
        # TODO have each function check for initialization of needed data so these
        # lines will be unnecessary.
        self.mainProperties
        self.header
        self.recipients
        self.attachments
        self.to
        self.cc
        self.sender
        self.date
        self.__crlf = '\n'  # This variable keeps track of what the new line character should be
        self.body

    def listDir(self, streams=True, storages=False):
        """
        Replacement for OleFileIO.listdir that runs at the current prefix directory.

        The returned paths are still full (root-relative) path lists; only
        entries located strictly below the prefix are kept.
        """
        temp = self.listdir(streams, storages)
        if self.__prefix == '':
            return temp
        prefix = self.__prefix.split('/')
        if prefix[-1] == '':
            prefix.pop()
        depth = len(prefix)
        return [x for x in temp if len(x) > depth and x[:depth] == prefix]

    def Exists(self, inp):
        """
        Checks if :param inp: exists in the msg file.
        """
        inp = self.fix_path(inp)
        return self.exists(inp)

    def sExists(self, inp):
        """
        Checks if string stream :param inp: exists in the msg file.

        Both the '001F' and the '001E' suffixed variants are tried.
        """
        inp = self.fix_path(inp)
        return self.exists(inp + '001F') or self.exists(inp + '001E')

    def fix_path(self, inp, prefix=True):
        """
        Changes paths so that they have the proper
        prefix (should :param prefix: be True) and
        are strings rather than lists or tuples.
        """
        if isinstance(inp, (list, tuple)):
            inp = '/'.join(inp)
        if prefix:
            inp = self.__prefix + inp
        return inp

    def _getStream(self, filename, prefix=True):
        """
        Returns the raw contents of the stream :param filename:, or None (after
        logging at info level) if the stream does not exist. :param prefix:
        is forwarded to fix_path.
        """
        filename = self.fix_path(filename, prefix)
        if self.exists(filename):
            with self.openstream(filename) as stream:
                return stream.read()
        else:
            logger.info('Stream "{}" was requested but could not be found. Returning `None`.'.format(filename))
            return None

    def _getStringStream(self, filename, prefix=True):
        """
        Gets a string representation of the requested filename.
        This should ALWAYS return a string (Unicode in python 2)

        The '001F' stream is read when areStringsUnicode is true; otherwise the
        '001E' stream is read and decoded with stringEncoding. Returns None if
        the '001E' stream is missing.
        """
        filename = self.fix_path(filename, prefix)
        if self.areStringsUnicode:
            return windowsUnicode(self._getStream(filename + '001F', prefix=False))
        else:
            tmp = self._getStream(filename + '001E', prefix=False)
            return None if tmp is None else tmp.decode(self.stringEncoding)

    def _joinRecipients(self, recipientType):
        """
        Returns the formatted recipients whose low four type bits equal
        :param recipientType:, joined with '; ', or None if there are none.
        """
        matches = [x.formatted for x in self.recipients
                   if x.type & 0x0000000f == recipientType]
        if not matches:
            return None
        joined = matches[0]
        # Later entries go through format() exactly as before, so they are
        # stringified the same way regardless of their type.
        for entry in matches[1:]:
            joined += '; {0}'.format(entry)
        return joined

    def _topLevelDirs(self, marker):
        """
        Returns the names (first-seen order, no duplicates) of the entries
        directly below the current prefix that start with :param marker:.
        """
        depth = len(self.__prefixList)
        found = []
        for dir_ in self.listDir():
            name = dir_[depth]
            if name.startswith(marker) and name not in found:
                found.append(name)
        return found

    @property
    def path(self):
        """
        Returns the message path if generated from a file,
        otherwise returns the data used to generate the
        Message instance.
        """
        return self.__path

    @property
    def prefix(self):
        """
        Returns the prefix of the Message instance.
        Intended for developer use.
        """
        return self.__prefix

    @property
    def prefixList(self):
        """
        Returns the prefix list of the Message instance.
        Intended for developer use.

        A deep copy is returned, so callers cannot alter the internal list.
        """
        return copy.deepcopy(self.__prefixList)

    @property
    def subject(self):
        """
        Returns the message subject, if it exists.
        """
        try:
            return self._subject
        except AttributeError:
            self._subject = encode(self._getStringStream('__substg1.0_0037'))
            return self._subject

    @property
    def header(self):
        """
        Returns the message header, if it exists. Otherwise it will generate one.

        A generated header is built from the date, sender, to, cc and
        message_id properties.
        """
        try:
            return self._header
        except AttributeError:
            headerText = self._getStringStream('__substg1.0_007D')
            if headerText is not None:
                self._header = EmailParser().parsestr(headerText)
                self._header['date'] = self.date
            else:
                logger.info('Header is empty or was not found. Header will be generated from other streams.')
                header = EmailParser().parsestr('')
                header.add_header('Date', self.date)
                header.add_header('From', self.sender)
                header.add_header('To', self.to)
                header.add_header('Cc', self.cc)
                header.add_header('Message-Id', self.message_id)
                # TODO find authentication results outside of header
                header.add_header('Authentication-Results', None)

                self._header = header
            return self._header

    @property
    def header_dict(self):
        """
        Returns a dictionary of the entries in the header, without the
        'Received' entry.
        """
        try:
            return self._header_dict
        except AttributeError:
            # Use the public items() accessor of the parsed header rather than
            # a private attribute.
            self._header_dict = dict(self.header.items())
            # There may be no 'Received' entry (e.g. for a generated header).
            self._header_dict.pop('Received', None)
            return self._header_dict

    def headerInit(self):
        """
        Checks whether the header has been initialized.
        """
        try:
            self._header
            return True
        except AttributeError:
            return False

    @property
    def mainProperties(self):
        """
        Returns the Properties instance used by the Message instance.
        """
        try:
            return self._prop
        except AttributeError:
            self._prop = Properties(self._getStream('__properties_version1.0'),
                                    constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED)
            return self._prop

    @property
    def date(self):
        """
        Returns the send date, if it exists.
        """
        try:
            return self._date
        except AttributeError:
            # Go through the property so this works even if mainProperties has
            # not been accessed yet.
            self._date = self.mainProperties.date
            return self._date

    @property
    def parsedDate(self):
        """
        Returns the date as parsed by email.utils.parsedate.
        """
        return email.utils.parsedate(self.date)

    @property
    def stringEncoding(self):
        """
        Returns the name of the encoding used to decode string streams.

        :raises Exception: if the strings are not unicode and property
            '3FFD0003' is missing.
        """
        try:
            return self.__stringEncoding
        except AttributeError:
            # We need to calculate the encoding
            # Let's first check if the encoding will be unicode:
            if self.areStringsUnicode:
                self.__stringEncoding = "utf-16-le"
                return self.__stringEncoding
            else:
                # Well, it's not unicode. Now we have to figure out what it IS.
                if not self.mainProperties.has_key('3FFD0003'):
                    raise Exception('Encoding property not found')
                enc = self.mainProperties['3FFD0003'].value
                # Now we just need to translate that value
                # Now, this next line SHOULD work, but it is possible that it might not...
                self.__stringEncoding = str(enc)
                return self.__stringEncoding

    @property
    def areStringsUnicode(self):
        """
        Returns a boolean telling if the strings are unicode encoded.

        True only when property '340D0003' exists and has bit 0x40000 set.
        """
        try:
            return self.__bStringsUnicode
        except AttributeError:
            if self.mainProperties.has_key('340D0003'):
                if (self.mainProperties['340D0003'].value & 0x40000) != 0:
                    self.__bStringsUnicode = True
                    return self.__bStringsUnicode
            self.__bStringsUnicode = False
            return self.__bStringsUnicode

    @property
    def sender(self):
        """
        Returns the message sender, if it exists.
        """
        try:
            return self._sender
        except AttributeError:
            # Check header first
            if self.headerInit():
                headerResult = self.header['from']
                if headerResult is not None:
                    self._sender = headerResult
                    return headerResult
                logger.info('Header found, but "sender" is not included. Will be generated from other streams.')
            # Extract from other fields
            text = self._getStringStream('__substg1.0_0C1A')
            # Will not give an email address sometimes. Seems to exclude the
            # email address if YOU are the sender.
            address = self._getStringStream('__substg1.0_5D01')
            if text is None:
                result = address
            else:
                result = text
                if address is not None:
                    result += ' <' + address + '>'

            self._sender = result
            return result

    @property
    def to(self):
        """
        Returns the to field, if it exists.
        """
        try:
            return self._to
        except AttributeError:
            # Check header first
            headerResult = None
            if self.headerInit():
                headerResult = self.header['to']
            if headerResult is not None:
                self._to = headerResult
            else:
                if self.headerInit():
                    logger.info('Header found, but "to" is not included. Will be generated from other streams.')
                self._to = self._joinRecipients(1)
            return self._to

    @property
    def compressedRtf(self):
        """
        Returns the compressed RTF stream, if it exists.
        """
        try:
            return self._compressedRtf
        except AttributeError:
            self._compressedRtf = self._getStream('__substg1.0_10090102')
            return self._compressedRtf

    @property
    def htmlBody(self):
        """
        Returns the html body, if it exists.
        """
        try:
            return self._htmlBody
        except AttributeError:
            self._htmlBody = self._getStream('__substg1.0_10130102')
            return self._htmlBody

    @property
    def cc(self):
        """
        Returns the cc field, if it exists.
        """
        try:
            return self._cc
        except AttributeError:
            # Check header first
            headerResult = None
            if self.headerInit():
                headerResult = self.header['cc']
            if headerResult is not None:
                self._cc = headerResult
            else:
                if self.headerInit():
                    logger.info('Header found, but "cc" is not included. Will be generated from other streams.')
                self._cc = self._joinRecipients(2)
            return self._cc

    @property
    def message_id(self):
        """
        Returns the message id from the header if the header is initialized
        and has one, otherwise from stream '__substg1.0_1035'.
        """
        try:
            return self._message_id
        except AttributeError:
            headerResult = None
            if self.headerInit():
                headerResult = self._header['message-id']
            if headerResult is not None:
                self._message_id = headerResult
            else:
                if self.headerInit():
                    logger.info('Header found, but "Message-Id" is not included. Will be generated from other streams.')
                self._message_id = self._getStringStream('__substg1.0_1035')
            return self._message_id

    @property
    def reply_to(self):
        """
        Returns the contents of string stream '__substg1.0_1042', if it exists.
        """
        try:
            return self._reply_to
        except AttributeError:
            self._reply_to = self._getStringStream('__substg1.0_1042')
            return self._reply_to

    @property
    def body(self):
        """
        Returns the message body, if it exists.

        As a side effect, the stored newline sequence is switched to '\\r\\n'
        when the body contains one.
        """
        try:
            return self._body
        except AttributeError:
            self._body = self._getStringStream('__substg1.0_1000')
            if self._body:
                self._body = encode(self._body)
                if '\r\n' in self._body:
                    self.__crlf = '\r\n'
            return self._body

    @property
    def crlf(self):
        """
        Returns the value of self.__crlf, should you need it for whatever reason.
        """
        # Make sure the body has been inspected, since that is what may
        # update the newline sequence.
        self.body
        return self.__crlf

    @property
    def attachmentClass(self):
        """
        Returns the Attachment class being used, should you need to use it externally for whatever reason.
        """
        return self.__attachmentClass

    @property
    def attachments(self):
        """
        Returns a list of all attachments.
        """
        try:
            return self._attachments
        except AttributeError:
            # Get the attachments
            self._attachments = [self.__attachmentClass(self, attachmentDir)
                                 for attachmentDir in self._topLevelDirs('__attach')]
            return self._attachments

    @property
    def recipients(self):
        """
        Returns a list of all recipients.
        """
        try:
            return self._recipients
        except AttributeError:
            # Get the recipients
            self._recipients = [Recipient(recipientDir, self)
                                for recipientDir in self._topLevelDirs('__recip')]
            return self._recipients

    def save(self, toJson=False, useFileName=False, raw=False, ContentId=False, customPath=None, customFilename=None):
        """
        Saves the message body and attachments found in the message. Setting toJson
        to true will output the message body as JSON-formatted text. The body and
        attachments are stored in a folder. Setting useFileName to true will mean that
        the filename is used as the name of the folder; otherwise, the message's date
        and subject are used as the folder name.
        Here is the absolute order of priority for the name of the folder:
            1. customFilename
            2. self.filename if useFileName
            3. {date} {subject}

        :param raw: accepted for interface compatibility; not used here.
        :param ContentId: forwarded to each attachment's save method.
        :param customPath: optional directory prepended to the folder name.
        :raises ValueError: if useFileName is set but no filename is known.

        If anything fails while writing, saveRaw() is called and the original
        exception is re-raised. The working directory is always restored.
        """
        if customFilename is not None and customFilename != '':
            dirName = customFilename
        elif useFileName:
            if self.filename is None:
                raise ValueError(
                    'Filename must be specified, or path must have been an actual path, to save using filename')
            # strip out the extension
            dirName = self.filename.split('/').pop().split('.')[0]
        else:
            # Create a directory based on the date and subject of the message
            d = self.parsedDate
            if d is not None:
                dirName = '{0:02d}-{1:02d}-{2:02d}_{3:02d}{4:02d}'.format(*d)
            else:
                dirName = 'UnknownDate'

            if self.subject is None:
                subject = '[No subject]'
            else:
                # Drop characters that are not allowed in file names.
                subject = ''.join(i for i in self.subject if i not in r'\/:*?"<>|')

            dirName = dirName + ' ' + subject

        if customPath is not None and customPath != '':
            # Only add a separator when the path does not already end in one.
            if customPath[-1] != '/' and customPath[-1] != '\\':
                customPath += '/'
            dirName = customPath + dirName

        try:
            os.makedirs(dirName)
        except Exception:
            newDirName = addNumToDir(dirName)
            if newDirName is not None:
                dirName = newDirName
            else:
                raise Exception(
                    "Failed to create directory '%s'. Does it already exist?" %
                    dirName
                )

        oldDir = os.getcwdu()
        try:
            os.chdir(dirName)

            # Save the message body
            fext = 'json' if toJson else 'text'

            # The context manager closes the file even if saving fails midway.
            with open('message.' + fext, 'w') as f:
                # From, to , cc, subject, date
                attachmentNames = []
                # Save the attachments
                for attachment in self.attachments:
                    attachmentNames.append(attachment.save(ContentId, toJson))

                if toJson:
                    emailObj = {'from': xstr(self.sender),
                                'to': xstr(self.to),
                                'cc': xstr(self.cc),
                                'subject': xstr(self.subject),
                                'date': xstr(self.date),
                                'attachments': attachmentNames,
                                'body': decode_utf7(self.body)}

                    f.write(json.dumps(emailObj, ensure_ascii=True))
                else:
                    f.write('From: ' + xstr(self.sender) + self.__crlf)
                    f.write('To: ' + xstr(self.to) + self.__crlf)
                    f.write('CC: ' + xstr(self.cc) + self.__crlf)
                    f.write('Subject: ' + xstr(self.subject) + self.__crlf)
                    f.write('Date: ' + xstr(self.date) + self.__crlf)
                    f.write('-----------------' + self.__crlf + self.__crlf)
                    f.write(self.body)

        except Exception:
            self.saveRaw()
            raise

        finally:
            # Return to previous directory
            os.chdir(oldDir)

    def saveRaw(self):
        """
        Dumps every stream of the file into a 'raw' folder created in the
        current working directory, one sub-folder per stream.
        """
        # Create a 'raw' folder
        oldDir = os.getcwdu()
        try:
            rawDir = 'raw'
            os.makedirs(rawDir)
            os.chdir(rawDir)
            sysRawDir = os.getcwdu()

            # Loop through all the directories
            for dir_ in self.listdir():
                sysdir = '/'.join(dir_)
                code = dir_[-1][-8:]
                if code in constants.PROPERTIES:
                    sysdir = sysdir + ' - ' + constants.PROPERTIES[code]
                os.makedirs(sysdir)
                os.chdir(sysdir)

                # Generate appropriate filename
                if dir_[-1].endswith('001E'):
                    filename = 'contents.txt'
                else:
                    filename = 'contents'

                # Save contents of directory. listdir() paths are already
                # relative to the file root, so the prefix must not be added.
                with open(filename, 'wb') as f:
                    f.write(self._getStream(dir_, prefix=False))

                # Return to base directory
                os.chdir(sysRawDir)

        finally:
            os.chdir(oldDir)

    def dump(self):
        """
        Prints out a summary of the message
        """
        print('Message')
        print('Subject:', self.subject)
        print('Date:', self.date)
        print('Body:')
        print(self.body)

    def debug(self):
        """
        Prints the location and raw contents of every string stream ('001E' or
        '001F') found below the current prefix.
        """
        for dir_ in self.listDir():
            if dir_[-1].endswith('001E') or dir_[-1].endswith('001F'):
                print('Directory: ' + str(dir_[:-1]))
                # listDir() yields root-relative paths, so the prefix must not
                # be prepended a second time.
                print('Contents: {}'.format(self._getStream(dir_, prefix=False)))

    def save_attachments(self, contentId=False, json=False, useFileName=False, raw=False, customPath=None):
        """
        Saves only attachments in the same folder.

        All arguments are forwarded positionally to each attachment's save
        method.
        """
        for attachment in self.attachments:
            attachment.save(contentId, json, useFileName, raw, customPath)