import re
import struct
import logging
import six  # Python 2+3 compatibility

# hashlib provides md5(); aliasing the module as ``md5`` lets the rest of
# the file call ``md5.md5(...)`` whichever module ends up imported.
try:
    import hashlib as md5
except ImportError:
    import md5

# Use the Crypto package when it is importable; otherwise fall back to the
# package-local arcfour module and mark AES / SHA256 as unavailable (None).
try:
    from Crypto.Cipher import ARC4
    from Crypto.Cipher import AES
    from Crypto.Hash import SHA256
except ImportError:
    AES = SHA256 = None
    from . import arcfour as ARC4

from .psparser import PSEOF
from .psparser import literal_name
from .psparser import LIT
from .psparser import KWD
from . import settings
from .pdftypes import PDFException
from .pdftypes import PDFTypeError
from .pdftypes import PDFStream
from .pdftypes import PDFObjectNotFound
from .pdftypes import decipher_all
from .pdftypes import int_value
from .pdftypes import str_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdftypes import stream_value
from .pdfparser import PDFSyntaxError
from .pdfparser import PDFStreamParser
from .utils import choplist
from .utils import nunpack
from .utils import decode_text

log = logging.getLogger(__name__)


##  Exceptions
##
class PDFNoValidXRef(PDFSyntaxError):
    """A cross-reference section could not be read."""


class PDFNoOutlines(PDFException):
    """Outline-related error (raise site is not in the code visible here)."""


class PDFDestinationNotFound(PDFException):
    """Destination-related error (raise site is not in the code visible here)."""


class PDFEncryptionError(PDFException):
    """Base class for the encryption-related errors defined below."""


class PDFPasswordIncorrect(PDFEncryptionError):
    """Authentication with the supplied password produced no key."""


class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Encryption-related error; presumably signals that the document
    forbids text extraction (raise site is not in the code visible here)."""


# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')


##  XRefs
##
class PDFBaseXRef(object):
    """Interface shared by the cross-reference implementations below."""

    def get_trailer(self):
        """Return the trailer dictionary; subclasses must override."""
        raise NotImplementedError

    def get_objids(self):
        """Return an iterable of known object ids (none in the base class)."""
        return []

    # Must return
    #   (strmid, index, genno)
    #  or (None, pos, genno)
    def get_pos(self, objid):
        """Locate *objid*; the base class knows no objects and raises KeyError."""
        raise KeyError(objid)


##  PDFXRef
##
class PDFXRef(PDFBaseXRef):
    """Cross-reference data read from a textual xref table."""

    def __init__(self):
        # objid -> (None, pos, genno)
        self.offsets = {}
        self.trailer = {}
        return

    def __repr__(self):
        # One-element tuple so the argument is always consumed by %r.
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys(),)

    def load(self, parser):
        """Read xref subsections from *parser*, then the trailer.

        Each subsection is a ``start count`` header line followed by
        ``count`` entry lines of the form ``pos genno use``; only entries
        whose ``use`` field is ``n`` are recorded in ``self.offsets``.

        Raises:
            PDFNoValidXRef: on EOF, or on a header/entry line that does not
                have the expected number of fields or a numeric header.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
                if not line.strip():
                    continue
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            # NOTE(review): unreachable as written -- an empty line already
            # took the ``continue`` above. Left in place deliberately.
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if line.startswith(b'trailer'):
                # Rewind so load_trailer() sees the keyword itself.
                parser.seek(pos)
                break
            f = line.strip().split(b' ')
            if len(f) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' %
                                     (parser, line))
            try:
                if six.PY2:
                    (start, nobjs) = map(long, f)
                else:
                    (start, nobjs) = map(int, f)
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' %
                                     (parser, line))
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                f = line.strip().split(b' ')
                if len(f) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' %
                                         (parser, line))
                (pos, genno, use) = f
                if use != b'n':
                    continue
                self.offsets[objid] = (None,
                                       long(pos) if six.PY2 else int(pos),
                                       int(genno))
        log.info('xref objects: %r', self.offsets)
        self.load_trailer(parser)
        return

    def load_trailer(self, parser):
        """Read the ``trailer`` keyword and its dictionary into self.trailer.

        If the parser hits EOF first, the most recent object popped from the
        parser is used instead; PDFNoValidXRef is raised when there is none.
        """
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b'trailer'), str(kwd)
            (_, dic) = parser.nextobject()
        except PSEOF:
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        log.debug('trailer=%r', self.trailer)
        return

    def get_trailer(self):
        """Return the trailer dictionary collected by load_trailer()."""
        return self.trailer

    def get_objids(self):
        """Return an iterator over the object ids recorded in self.offsets."""
        return six.iterkeys(self.offsets)

    def get_pos(self, objid):
        """Return the ``(None, pos, genno)`` entry for *objid*.

        Raises:
            KeyError: if *objid* was not recorded.
        """
        # A missing key raises KeyError by itself; no wrapper is needed.
        return self.offsets[objid]


##  PDFXRefFallback
##
class PDFXRefFallback(PDFXRef):
    """Builds the offsets table by scanning for ``N G obj`` lines."""

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys(),)

    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser):
        """Scan *parser* from offset 0, recording every object header found.

        Scanning stops at EOF or at the first line starting with
        ``trailer`` (which is then loaded). Objects that turn out to be
        object streams are expanded so their members are recorded too.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith(b'trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                log.info('trailer: %r', self.trailer)
                break
            if six.PY3:
                line = line.decode('latin-1')  # default pdf encoding
            m = self.PDFOBJ_CUE.match(line)
            if not m:
                continue
            (objid, genno) = m.groups()
            objid = int(objid)
            genno = int(genno)
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
                stream = stream_value(obj)
                try:
                    n = stream['N']
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError('N is not defined: %r' % stream)
                    n = 0
                parser1 = PDFStreamParser(stream.get_data())
                objs = []
                try:
                    while True:
                        (_, obj) = parser1.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                # Entries are read in pairs and the first of each pair is
                # taken as the member's object id; never trust N beyond the
                # number of pairs actually parsed.
                n = min(n, len(objs)//2)
                for index in range(n):
                    objid1 = objs[index*2]
                    self.offsets[objid1] = (objid, index, 0)
        return


##  PDFXRefStream
##
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference data read from an XRef stream object."""

    def __init__(self):
        self.data = None       # raw stream data, set by load()
        self.entlen = None     # bytes per entry (fl1 + fl2 + fl3)
        self.fl1 = self.fl2 = self.fl3 = None  # the three field widths ('W')
        self.ranges = []       # (start, nobjs) pairs taken from 'Index'
        return

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges,)

    def load(self, parser):
        """Read an XRef stream object from *parser*.

        Raises:
            PDFNoValidXRef: if the object is not a PDFStream whose 'Type'
                is the XRef literal.
            PDFSyntaxError: if 'Index' has an odd number of items.
        """
        # Three leading tokens are consumed and ignored (the original code
        # named them objid, genno and kwd).
        parser.nexttoken()
        parser.nexttoken()
        parser.nexttoken()
        (_, stream) = parser.nextobject()
        if (not isinstance(stream, PDFStream) or
                stream['Type'] is not LITERAL_XREF):
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        # Without 'Index' the stream is one range covering ids 0..size-1.
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        log.info('xref stream: objid=%s, fields=%d,%d,%d',
                 ', '.join(map(repr, self.ranges)),
                 self.fl1, self.fl2, self.fl3)
        return

    def get_trailer(self):
        """Return the stream's attribute dictionary, used as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield every object id whose entry type (first field) is 1 or 2.

        Entries for all ranges are stored back to back in ``self.data``, so
        the entry index has to keep counting across ranges -- the same
        accumulation get_pos() performs. Restarting at 0 for each range
        would make every later range re-read the first range's entries.
        """
        base = 0  # number of entries belonging to earlier ranges
        for (start, nobjs) in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * (base + i)
                ent = self.data[offset:offset+self.entlen]
                # presumably the second argument is the value used when the
                # type field is zero bytes wide -- confirm against nunpack
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            base += nobjs
        return

    def get_pos(self, objid):
        """Return the location tuple for *objid*.

        Returns:
            ``(None, f2, f3)`` for a type-1 entry, ``(f2, f3, 0)`` for a
            type-2 entry.

        Raises:
            KeyError: if *objid* lies in no range, or its entry has any
                other type (a free object).
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)


# NOTE(review): the class below continues past the reviewed section; the remainder of this physical line is carried through exactly as found.
## PDFSecurityHandler ## class PDFStandardSecurityHandler(object): PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') supported_revisions = (2, 3) def __init__(self, docid, param, password=''): self.docid = docid self.param = param self.password = password self.init() return def init(self): self.init_params() if self.r not in self.supported_revisions: raise PDFEncryptionError('Unsupported revision: param=%r' % self.param) self.init_key() return def init_params(self): self.v = int_value(self.param.get('V', 0)) self.r = int_value(self.param['R']) self.p = int_value(self.param['P']) self.o = str_value(self.param['O']) self.u = str_value(self.param['U']) self.length = int_value(self.param.get('Length', 40)) return def init_key(self): self.key = self.authenticate(self.password) if self.key is None: raise PDFPasswordIncorrect return def is_printable(self): return bool(self.p & 4) def is_modifiable(self): return bool(self.p & 8) def is_extractable(self): return bool(self.p & 16) def compute_u(self, key): if self.r == 2: # 
Algorithm 3.4 return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2 else: # Algorithm 3.5 hash = md5.md5(self.PASSWORD_PADDING) # 2 hash.update(self.docid[0]) # 3 result = ARC4.new(key).encrypt(hash.digest()) # 4 for i in range(1, 20): # 5 k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key)) result = ARC4.new(k).encrypt(result) result += result # 6 return result def compute_encryption_key(self, password): # Algorithm 3.2 password = (password + self.PASSWORD_PADDING)[:32] # 1 hash = md5.md5(password) # 2 hash.update(self.o) # 3 hash.update(struct.pack('= 4: if not self.encrypt_metadata: hash.update(b'\xff\xff\xff\xff') result = hash.digest() n = 5 if self.r >= 3: n = self.length // 8 for _ in range(50): result = md5.md5(result[:n]).digest() return result[:n] def authenticate(self, password): password = password.encode("latin1") key = self.authenticate_user_password(password) if key is None: key = self.authenticate_owner_password(password) return key def authenticate_user_password(self, password): key = self.compute_encryption_key(password) if self.verify_encryption_key(key): return key else: return None def verify_encryption_key(self, key): # Algorithm 3.6 u = self.compute_u(key) if self.r == 2: return u == self.u return u[:16] == self.u[:16] def authenticate_owner_password(self, password): # Algorithm 3.7 password = (password + self.PASSWORD_PADDING)[:32] hash = md5.md5(password) if self.r >= 3: for _ in range(50): hash = md5.md5(hash.digest()) n = 5 if self.r >= 3: n = self.length // 8 key = hash.digest()[:n] if self.r == 2: user_password = ARC4.new(key).decrypt(self.o) else: user_password = self.o for i in range(19, -1, -1): k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key)) user_password = ARC4.new(k).decrypt(user_password) return self.authenticate_user_password(user_password) def decrypt(self, objid, genno, data, attrs=None): return self.decrypt_rc4(objid, genno, data) def decrypt_rc4(self, objid, genno, data): key = self.key + 
struct.pack('