PCQRSCANER/venv/Lib/site-packages/pdfminer/pdfdocument.py
2019-12-22 21:51:47 +01:00

817 lines
27 KiB
Python

import re
import struct
import logging
import six # Python 2+3 compatibility
try:
import hashlib as md5
except ImportError:
import md5
try:
from Crypto.Cipher import ARC4
from Crypto.Cipher import AES
from Crypto.Hash import SHA256
except ImportError:
AES = SHA256 = None
from . import arcfour as ARC4
from .psparser import PSEOF
from .psparser import literal_name
from .psparser import LIT
from .psparser import KWD
from . import settings
from .pdftypes import PDFException
from .pdftypes import PDFTypeError
from .pdftypes import PDFStream
from .pdftypes import PDFObjectNotFound
from .pdftypes import decipher_all
from .pdftypes import int_value
from .pdftypes import str_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdftypes import stream_value
from .pdfparser import PDFSyntaxError
from .pdfparser import PDFStreamParser
from .utils import choplist
from .utils import nunpack
from .utils import decode_text
log = logging.getLogger(__name__)
## Exceptions
##
class PDFNoValidXRef(PDFSyntaxError):
pass
class PDFNoOutlines(PDFException):
pass
class PDFDestinationNotFound(PDFException):
pass
class PDFEncryptionError(PDFException):
pass
class PDFPasswordIncorrect(PDFEncryptionError):
pass
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')
## XRefs
##
class PDFBaseXRef(object):
def get_trailer(self):
raise NotImplementedError
def get_objids(self):
return []
# Must return
# (strmid, index, genno)
# or (None, pos, genno)
def get_pos(self, objid):
raise KeyError(objid)
## PDFXRef
##
class PDFXRef(PDFBaseXRef):
def __init__(self):
self.offsets = {}
self.trailer = {}
return
def __repr__(self):
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
def load(self, parser):
while True:
try:
(pos, line) = parser.nextline()
if not line.strip():
continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
raise PDFNoValidXRef('Premature eof: %r' % parser)
if line.startswith(b'trailer'):
parser.seek(pos)
break
f = line.strip().split(b' ')
if len(f) != 2:
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
try:
if six.PY2:
(start, nobjs) = map(long, f)
else:
(start, nobjs) = map(int, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
for objid in range(start, start+nobjs):
try:
(_, line) = parser.nextline()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
f = line.strip().split(b' ')
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != b'n':
continue
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
log.info('xref objects: %r', self.offsets)
self.load_trailer(parser)
return
def load_trailer(self, parser):
try:
(_, kwd) = parser.nexttoken()
assert kwd is KWD(b'trailer'), str(kwd)
(_, dic) = parser.nextobject()
except PSEOF:
x = parser.pop(1)
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_, dic) = x[0]
self.trailer.update(dict_value(dic))
log.debug('trailer=%r', self.trailer)
return
def get_trailer(self):
return self.trailer
def get_objids(self):
return six.iterkeys(self.offsets)
def get_pos(self, objid):
try:
return self.offsets[objid]
except KeyError:
raise
## PDFXRefFallback
##
class PDFXRefFallback(PDFXRef):
def __repr__(self):
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load(self, parser):
parser.seek(0)
while 1:
try:
(pos, line) = parser.nextline()
except PSEOF:
break
if line.startswith(b'trailer'):
parser.seek(pos)
self.load_trailer(parser)
log.info('trailer: %r', self.trailer)
break
if six.PY3:
line=line.decode('latin-1') #default pdf encoding
m = self.PDFOBJ_CUE.match(line)
if not m:
continue
(objid, genno) = m.groups()
objid = int(objid)
genno = int(genno)
self.offsets[objid] = (None, pos, genno)
# expand ObjStm.
parser.seek(pos)
(_, obj) = parser.nextobject()
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
stream = stream_value(obj)
try:
n = stream['N']
except KeyError:
if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
parser1 = PDFStreamParser(stream.get_data())
objs = []
try:
while 1:
(_, obj) = parser1.nextobject()
objs.append(obj)
except PSEOF:
pass
n = min(n, len(objs)//2)
for index in range(n):
objid1 = objs[index*2]
self.offsets[objid1] = (objid, index, 0)
return
## PDFXRefStream
##
class PDFXRefStream(PDFBaseXRef):
def __init__(self):
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
self.ranges = []
return
def __repr__(self):
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
def load(self, parser):
(_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken()
(_, stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (0, size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.ranges.extend(choplist(2, index_array))
(self.fl1, self.fl2, self.fl3) = stream['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs
log.info('xref stream: objid=%s, fields=%d,%d,%d',
', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)
return
def get_trailer(self):
return self.trailer
def get_objids(self):
for (start, nobjs) in self.ranges:
for i in range(nobjs):
offset = self.entlen * i
ent = self.data[offset:offset+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
if f1 == 1 or f1 == 2:
yield start+i
return
def get_pos(self, objid):
index = 0
for (start, nobjs) in self.ranges:
if start <= objid and objid < start+nobjs:
index += objid - start
break
else:
index += nobjs
else:
raise KeyError(objid)
offset = self.entlen * index
ent = self.data[offset:offset+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
f3 = nunpack(ent[self.fl1+self.fl2:])
if f1 == 1:
return (None, f2, f3)
elif f1 == 2:
return (f2, f3, 0)
else:
# this is a free object
raise KeyError(objid)
## PDFSecurityHandler
##
class PDFStandardSecurityHandler(object):
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
supported_revisions = (2, 3)
def __init__(self, docid, param, password=''):
self.docid = docid
self.param = param
self.password = password
self.init()
return
def init(self):
self.init_params()
if self.r not in self.supported_revisions:
raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)
self.init_key()
return
def init_params(self):
self.v = int_value(self.param.get('V', 0))
self.r = int_value(self.param['R'])
self.p = int_value(self.param['P'])
self.o = str_value(self.param['O'])
self.u = str_value(self.param['U'])
self.length = int_value(self.param.get('Length', 40))
return
def init_key(self):
self.key = self.authenticate(self.password)
if self.key is None:
raise PDFPasswordIncorrect
return
def is_printable(self):
return bool(self.p & 4)
def is_modifiable(self):
return bool(self.p & 8)
def is_extractable(self):
return bool(self.p & 16)
def compute_u(self, key):
if self.r == 2:
# Algorithm 3.4
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
else:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key))
result = ARC4.new(k).encrypt(result)
result += result # 6
return result
def compute_encryption_key(self, password):
# Algorithm 3.2
password = (password + self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(self.o) # 3
hash.update(struct.pack('<l', self.p)) # 4
hash.update(self.docid[0]) # 5
if self.r >= 4:
if not self.encrypt_metadata:
hash.update(b'\xff\xff\xff\xff')
result = hash.digest()
n = 5
if self.r >= 3:
n = self.length // 8
for _ in range(50):
result = md5.md5(result[:n]).digest()
return result[:n]
def authenticate(self, password):
password = password.encode("latin1")
key = self.authenticate_user_password(password)
if key is None:
key = self.authenticate_owner_password(password)
return key
def authenticate_user_password(self, password):
key = self.compute_encryption_key(password)
if self.verify_encryption_key(key):
return key
else:
return None
def verify_encryption_key(self, key):
# Algorithm 3.6
u = self.compute_u(key)
if self.r == 2:
return u == self.u
return u[:16] == self.u[:16]
def authenticate_owner_password(self, password):
# Algorithm 3.7
password = (password + self.PASSWORD_PADDING)[:32]
hash = md5.md5(password)
if self.r >= 3:
for _ in range(50):
hash = md5.md5(hash.digest())
n = 5
if self.r >= 3:
n = self.length // 8
key = hash.digest()[:n]
if self.r == 2:
user_password = ARC4.new(key).decrypt(self.o)
else:
user_password = self.o
for i in range(19, -1, -1):
k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key))
user_password = ARC4.new(k).decrypt(user_password)
return self.authenticate_user_password(user_password)
def decrypt(self, objid, genno, data, attrs=None):
return self.decrypt_rc4(objid, genno, data)
def decrypt_rc4(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return ARC4.new(key).decrypt(data)
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
supported_revisions = (4,)
def init_params(self):
super(PDFStandardSecurityHandlerV4, self).init_params()
self.length = 128
self.cf = dict_value(self.param.get('CF'))
self.stmf = literal_name(self.param['StmF'])
self.strf = literal_name(self.param['StrF'])
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
if self.stmf != self.strf:
raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param)
self.cfm = {}
for k, v in self.cf.items():
f = self.get_cfm(literal_name(v['CFM']))
if f is None:
raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param)
self.cfm[k] = f
self.cfm['Identity'] = self.decrypt_identity
if self.strf not in self.cfm:
raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
return
def get_cfm(self, name):
if name == 'V2':
return self.decrypt_rc4
elif name == 'AESV2':
return self.decrypt_aes128
else:
return None
def decrypt(self, objid, genno, data, attrs=None, name=None):
if not self.encrypt_metadata and attrs is not None:
t = attrs.get('Type')
if t is not None and literal_name(t) == 'Metadata':
return data
if name is None:
name = self.strf
return self.cfm[name](objid, genno, data)
def decrypt_identity(self, objid, genno, data):
return data
def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + b'sAlT'
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
supported_revisions = (5,)
def init_params(self):
super(PDFStandardSecurityHandlerV5, self).init_params()
self.length = 256
self.oe = str_value(self.param['OE'])
self.ue = str_value(self.param['UE'])
self.o_hash = self.o[:32]
self.o_validation_salt = self.o[32:40]
self.o_key_salt = self.o[40:]
self.u_hash = self.u[:32]
self.u_validation_salt = self.u[32:40]
self.u_key_salt = self.u[40:]
return
def get_cfm(self, name):
if name == 'AESV3':
return self.decrypt_aes256
else:
return None
def authenticate(self, password):
password = password.encode('utf-8')[:127]
hash = SHA256.new(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = SHA256.new(password)
hash.update(self.o_key_salt)
hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16).decrypt(self.oe)
hash = SHA256.new(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = SHA256.new(password)
hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16).decrypt(self.ue)
return None
def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
## PDFDocument
##
class PDFDocument(object):
"""PDFDocument object represents a PDF document.
Since a PDF file can be very big, normally it is not loaded at
once. So PDF document has to cooperate with a PDF parser in order to
dynamically import the data as processing goes.
Typical usage:
doc = PDFDocument(parser, password)
obj = doc.getobj(objid)
"""
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
}
if AES is not None:
security_handler_registry[4] = PDFStandardSecurityHandlerV4
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5
def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."
self.caching = caching
self.xrefs = []
self.info = []
self.catalog = None
self.encryption = None
self.decipher = None
self._parser = None
self._cached_objs = {}
self._parsed_objs = {}
self._parser = parser
self._parser.set_document(self)
self.is_printable = self.is_modifiable = self.is_extractable = True
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
try:
pos = self.find_xref(parser)
self.read_xref_from(parser, pos, self.xrefs)
except PDFNoValidXRef:
pass # fallback = True
if fallback:
parser.fallback = True
xref = PDFXRefFallback()
xref.load(parser)
self.xrefs.append(xref)
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer:
continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption, str(self.encryption)
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
self._initialize_password(password)
if 'Info' in trailer:
self.info.append(dict_value(trailer['Info']))
if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root'])
break
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG:
if settings.STRICT:
raise PDFSyntaxError('Catalog not found!')
return
KEYWORD_OBJ = KWD(b'obj')
# _initialize_password(password=b'')
# Perform the initialization with a given password.
def _initialize_password(self, password=''):
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)
v = int_value(param.get('V', 0))
factory = self.security_handler_registry.get(v)
if factory is None:
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
handler = factory(docid, param, password)
self.decipher = handler.decrypt
self.is_printable = handler.is_printable()
self.is_modifiable = handler.is_modifiable()
self.is_extractable = handler.is_extractable()
self._parser.fallback = False # need to read streams with exact length
return
def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid]
else:
(objs, n) = self._get_objects(stream)
if self.caching:
self._parsed_objs[stream.objid] = (objs, n)
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFSyntaxError('index too big: %r' % index)
return obj
def _get_objects(self, stream):
if stream.get('Type') is not LITERAL_OBJSTM:
if settings.STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream['N']
except KeyError:
if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = []
try:
while 1:
(_, obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
return (objs, n)
def _getobj_parse(self, pos, objid):
self._parser.seek(pos)
(_, objid1) = self._parser.nexttoken() # objid
(_, genno) = self._parser.nexttoken() # genno
(_, kwd) = self._parser.nexttoken()
# #### hack around malformed pdf files
# copied from https://github.com/jaepil/pdfminer3k/blob/master/pdfminer/pdfparser.py#L399
#to solve https://github.com/pdfminer/pdfminer.six/issues/56
#assert objid1 == objid, str((objid1, objid))
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self._parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if objid1 != objid:
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
if kwd != KWD(b'obj'):
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
(_, obj) = self._parser.nextobject()
return obj
# can raise PDFObjectNotFound
def getobj(self, objid):
assert objid != 0
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
log.debug('getobj: objid=%r', objid)
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
except KeyError:
continue
try:
if strmid is not None:
stream = stream_value(self.getobj(strmid))
obj = self._getobj_objstm(stream, index, objid)
else:
obj = self._getobj_parse(index, objid)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
break
except (PSEOF, PDFSyntaxError):
continue
else:
raise PDFObjectNotFound(objid)
log.debug('register: objid=%r: %r', objid, obj)
if self.caching:
self._cached_objs[objid] = (obj, genno)
return obj
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
yield (level, title, dest, action, se)
if 'First' in entry and 'Last' in entry:
for x in search(entry['First'], level+1):
yield x
if 'Next' in entry:
for x in search(entry['Next'], level):
yield x
return
return search(self.catalog['Outlines'], 0)
def lookup_name(self, cat, key):
try:
names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError):
raise KeyError((cat, key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
if 'Limits' in d:
(k1, k2) = list_value(d['Limits'])
if key < k1 or k2 < key:
return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
return names[key]
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v:
return v
raise KeyError((cat, key))
return lookup(d0)
def get_dest(self, name):
try:
# PDF-1.2 or later
obj = self.lookup_name('Dests', name)
except KeyError:
# PDF-1.1 or prior
if 'Dests' not in self.catalog:
raise PDFDestinationNotFound(name)
d0 = dict_value(self.catalog['Dests'])
if name not in d0:
raise PDFDestinationNotFound(name)
obj = d0[name]
return obj
# find_xref
def find_xref(self, parser):
"""Internal function used to locate the first XRef."""
# search the last xref table by scanning the file backwards.
prev = None
for line in parser.revreadlines():
line = line.strip()
log.debug('find_xref: %r', line)
if line == b'startxref':
break
if line:
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
log.info('xref found: pos=%r', prev)
return long(prev) if six.PY2 else int(prev)
# read xref table
def read_xref_from(self, parser, start, xrefs):
"""Reads XRefs from the given location."""
parser.seek(start)
parser.reset()
try:
(pos, token) = parser.nexttoken()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF')
log.info('read_xref_from: start=%d, token=%r', start, token)
if isinstance(token, int):
# XRefStream: PDF-1.5
parser.seek(pos)
parser.reset()
xref = PDFXRefStream()
xref.load(parser)
else:
if token is parser.KEYWORD_XREF:
parser.nextline()
xref = PDFXRef()
xref.load(parser)
xrefs.append(xref)
trailer = xref.get_trailer()
log.info('trailer: %r', trailer)
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(parser, pos, xrefs)
if 'Prev' in trailer:
# find previous xref
pos = int_value(trailer['Prev'])
self.read_xref_from(parser, pos, xrefs)
return