PCQRSCANER/venv/Lib/site-packages/textract/parsers/pdf_parser.py
2019-12-22 21:51:47 +01:00

66 lines
2.3 KiB
Python

import os
import shutil
import six
from tempfile import mkdtemp
from ..exceptions import UnknownMethod, ShellError
from .utils import ShellParser
from .image import Parser as TesseractParser
class Parser(ShellParser):
    """Extract text from pdf files using either the ``pdftotext`` method
    (default) or the ``pdfminer`` method.

    A third method, ``tesseract``, rasterizes the pdf with ``pdftoppm`` and
    runs the image parser on every generated page file.
    """

    def extract(self, filename, method='', **kwargs):
        """Dispatch to the extraction backend selected by ``method``.

        :param filename: path of the pdf file to read.
        :param method: ``''`` (default), ``'pdftotext'``, ``'pdfminer'`` or
            ``'tesseract'``. The empty string means "try ``pdftotext`` and
            fall back to ``pdfminer`` if ``pdftotext`` is not installed".
        :param kwargs: forwarded unchanged to the selected backend.
        :returns: whatever the selected ``extract_*`` method returns.
        :raises UnknownMethod: if ``method`` is not one of the values above.
        :raises ShellError: if the ``pdftotext`` backend fails and the
            fallback does not apply.
        """
        if method == '' or method == 'pdftotext':
            try:
                return self.extract_pdftotext(filename, **kwargs)
            except ShellError as ex:
                # If pdftotext isn't installed and the pdftotext method
                # wasn't specified, then gracefully fallback to using
                # pdfminer instead.
                if method == '' and ex.is_not_installed():
                    return self.extract_pdfminer(filename, **kwargs)
                # Bare ``raise`` re-raises the active exception with its
                # original traceback intact (``raise ex`` would reset it
                # on Python 2).
                raise
        elif method == 'pdfminer':
            return self.extract_pdfminer(filename, **kwargs)
        elif method == 'tesseract':
            return self.extract_tesseract(filename, **kwargs)
        else:
            raise UnknownMethod(method)

    def extract_pdftotext(self, filename, **kwargs):
        """Extract text from pdfs using the pdftotext command line utility.

        If the ``layout`` keyword is present in ``kwargs`` the ``-layout``
        flag is passed to ``pdftotext``.

        NOTE(review): only the *presence* of the ``layout`` key is checked,
        so ``layout=False`` still enables the flag. Kept as-is for backward
        compatibility -- confirm whether callers rely on this.
        """
        if 'layout' in kwargs:
            args = ['pdftotext', '-layout', filename, '-']
        else:
            args = ['pdftotext', filename, '-']
        stdout, _ = self.run(args)
        return stdout

    def extract_pdfminer(self, filename, **kwargs):
        """Extract text from pdfs using pdfminer.

        Runs the ``pdf2txt.py`` script on ``filename`` and returns its
        captured standard output. ``kwargs`` is accepted for interface
        parity with the other backends but is not used here.
        """
        stdout, _ = self.run(['pdf2txt.py', filename])
        return stdout

    def extract_tesseract(self, filename, **kwargs):
        """Extract text from pdfs using tesseract (per-page OCR).

        ``pdftoppm`` writes its output files into a fresh temporary
        directory; each file is then handed to the image parser and the
        per-file results are concatenated. The temporary directory is
        removed on both success and failure.
        """
        temp_dir = mkdtemp()
        base = os.path.join(temp_dir, 'conv')
        contents = []
        try:
            # Only the files written under ``temp_dir`` matter here; the
            # command's own stdout/stderr are not needed.
            self.run(['pdftoppm', filename, base])

            # Process files in sorted-name order so the joined text follows
            # that order (presumably page order, given pdftoppm's numbered
            # output names -- verify for very large documents).
            for page in sorted(os.listdir(temp_dir)):
                page_path = os.path.join(temp_dir, page)
                page_content = TesseractParser().extract(page_path, **kwargs)
                contents.append(page_content)
            return six.b('').join(contents)
        finally:
            shutil.rmtree(temp_dir)