464 lines
12 KiB
Python
464 lines
12 KiB
Python
#!/usr/bin/env python
|
|
|
|
import shlex
|
|
import string
|
|
import subprocess
|
|
import sys
|
|
from contextlib import contextmanager
|
|
from csv import QUOTE_NONE
|
|
from distutils.version import LooseVersion
|
|
from errno import ENOENT
|
|
from functools import wraps
|
|
from glob import iglob
|
|
from io import BytesIO
|
|
from os import environ, extsep, remove
|
|
from os.path import normcase, normpath, realpath
|
|
from pkgutil import find_loader
|
|
from tempfile import NamedTemporaryFile
|
|
from threading import Timer
|
|
|
|
try:
|
|
from PIL import Image
|
|
except ImportError:
|
|
import Image
|
|
|
|
|
|
# Name (or path) of the tesseract executable. It is looked up at call time by
# the functions in this module, so reassigning it affects subsequent calls.
tesseract_cmd = 'tesseract'

# Optional dependencies are detected by simply trying to import them (the same
# pattern this module uses for PIL). This avoids relying on
# ``pkgutil.find_loader``, which is deprecated and removed in recent Pythons.
try:
    from numpy import ndarray
except ImportError:
    numpy_installed = False
else:
    numpy_installed = True

try:
    import pandas as pd
except ImportError:
    pandas_installed = False
else:
    pandas_installed = True
|
|
|
|
# Pillow mode name used when converting / compositing input images.
RGB_MODE = 'RGB'

# Image formats accepted by ``prepare`` (compared against ``image.format``).
SUPPORTED_FORMATS = {
    'BMP',
    'GIF',
    'JPEG',
    'PBM',
    'PGM',
    'PNG',
    'PPM',
    'TIFF',
}

# Maps each label found in tesseract's OSD report to a pair of
# (key used in the resulting dict, callable that converts the raw text value).
OSD_KEYS = dict((
    ('Page number', ('page_num', int)),
    ('Orientation in degrees', ('orientation', int)),
    ('Rotate', ('rotate', int)),
    ('Orientation confidence', ('orientation_conf', float)),
    ('Script', ('script', str)),
    ('Script confidence', ('script_conf', float)),
))
|
|
|
|
|
|
class Output:
    """Names of the result formats accepted as ``output_type`` arguments."""

    # raw bytes exactly as read from tesseract's output file
    BYTES = 'bytes'
    # pandas DataFrame (only handled by ``image_to_data``)
    DATAFRAME = 'data.frame'
    # Python dict built from the textual output
    DICT = 'dict'
    # decoded, stripped text
    STRING = 'string'
|
|
|
|
|
|
class PandasNotSupported(EnvironmentError):
    """Raised when DataFrame output is requested but pandas is unavailable."""

    def __init__(self):
        message = 'Missing pandas package'
        super(PandasNotSupported, self).__init__(message)
|
|
|
|
|
|
class TesseractError(RuntimeError):
    """
    Raised when the tesseract process finishes with a non-zero status.

    Attributes:
        status: the process return code.
        message: the error text collected from the process.
    """

    def __init__(self, status, message):
        # The base initialiser stores its arguments as ``self.args``.
        super(TesseractError, self).__init__(status, message)
        self.status = status
        self.message = message
|
|
|
|
|
|
class TesseractNotFoundError(EnvironmentError):
    """Raised when the configured tesseract executable cannot be started."""

    def __init__(self):
        # ``tesseract_cmd`` is read when the exception is created, so the
        # message names whatever executable is configured at that moment.
        message = tesseract_cmd + " is not installed or it's not in your path"
        super(TesseractNotFoundError, self).__init__(message)
|
|
|
|
|
|
class TSVNotSupported(EnvironmentError):
    """Raised when TSV output is requested from a too-old tesseract."""

    def __init__(self):
        message = 'TSV output not supported. Tesseract >= 3.05 required'
        super(TSVNotSupported, self).__init__(message)
|
|
|
|
|
|
def kill(process, code):
    """
    Kill ``process`` and record ``code`` as its return code.

    Used as the ``Timer`` callback in ``timeout_manager``; the overwritten
    ``returncode`` is how that manager recognises a forced termination.
    """
    process.kill()
    # Must come after kill(): the marker value is what callers inspect later.
    process.returncode = code
|
|
|
|
|
|
@contextmanager
def timeout_manager(proc, seconds=0):
    """
    Context manager that waits for ``proc`` and yields its stderr output.

    With a falsy ``seconds`` the process is simply waited for. Otherwise a
    ``Timer`` is armed that calls ``kill(proc, -1)`` after ``seconds``; if the
    process ends up with that marker return code, ``RuntimeError`` is raised
    when the block exits. The three process pipes are closed in every case.
    """
    try:
        if seconds:
            killed_code = -1
            watchdog = Timer(seconds, kill, [proc, killed_code])
            watchdog.start()
            try:
                stderr_data = proc.communicate()[1]
                yield stderr_data
            finally:
                watchdog.cancel()
                # kill() stamps killed_code on the process when it fires
                if proc.returncode == killed_code:
                    raise RuntimeError('Tesseract process timeout')
        else:
            yield proc.communicate()[1]
    finally:
        for pipe in (proc.stdin, proc.stdout, proc.stderr):
            pipe.close()
|
|
|
|
|
|
def run_once(func):
    """
    Decorator that calls ``func`` on the first invocation only.

    The first return value is stored on the wrapper as ``_result`` and handed
    back on every later call, whatever arguments are passed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # The wrapper object itself is the "nothing cached yet" sentinel, so
        # any real return value (including None) counts as cached.
        if wrapper._result is not wrapper:
            return wrapper._result
        wrapper._result = func(*args, **kwargs)
        return wrapper._result

    wrapper._result = wrapper
    return wrapper
|
|
|
|
|
|
def get_errors(error_string):
    """
    Turn the raw stderr bytes of a process into one line of text.

    Args:
        error_string: bytes captured from the process' stderr.

    Returns:
        The decoded text with its lines joined by single spaces and
        surrounding whitespace stripped.
    """
    # Decode leniently: this text only ends up in an error message, and a
    # strict decode could raise UnicodeDecodeError on non-UTF-8 output and
    # hide the error that is actually being reported.
    text = error_string.decode('utf-8', 'replace')
    return u' '.join(text.splitlines()).strip()
|
|
|
|
|
|
def cleanup(temp_name):
    """
    Tries to remove temp files by filename wildcard path.

    Every path matching ``temp_name + '*'`` is deleted. A falsy ``temp_name``
    (``''`` or ``None``) is a no-op.

    Raises:
        OSError: if a file cannot be removed for any reason other than it no
            longer existing.
    """
    if not temp_name:
        return

    for filename in iglob(temp_name + '*'):
        try:
            remove(filename)
        except OSError as e:
            # A file that is already gone is fine; anything else is re-raised
            # with a bare ``raise`` so the original traceback is kept.
            if e.errno != ENOENT:
                raise
|
|
|
|
|
|
def prepare(image):
    """
    Normalise ``image`` so it can be written to disk for tesseract.

    Args:
        image: a PIL image, or a numpy array when numpy is installed.

    Returns:
        A ``(image, extension)`` tuple: the image without an alpha band
        (transparency flattened onto white) and the format name to save it in
        (``image.format``, or ``'PNG'`` when the image has none).

    Raises:
        TypeError: if ``image`` is not a supported object or its format is not
            in ``SUPPORTED_FORMATS``.
    """
    if numpy_installed and isinstance(image, ndarray):
        image = Image.fromarray(image)

    if not isinstance(image, Image.Image):
        raise TypeError('Unsupported image object')

    extension = image.format or 'PNG'
    if extension not in SUPPORTED_FORMATS:
        raise TypeError('Unsupported image format/type')

    if 'A' in image.getbands():
        # Flatten transparency BEFORE any conversion to RGB: converting e.g.
        # an 'LA' image to RGB first would discard the alpha band, and the
        # compositing onto white would then never happen.
        rgba = image if image.mode == 'RGBA' else image.convert('RGBA')
        background = Image.new(RGB_MODE, rgba.size, (255, 255, 255))
        # the RGBA image doubles as the paste mask (its alpha band is used)
        background.paste(rgba, (0, 0), rgba)
        image = background
    elif not image.mode.startswith(RGB_MODE):
        image = image.convert(RGB_MODE)

    image.format = extension
    return image, extension
|
|
|
|
|
|
@contextmanager
def save(image):
    """
    Context manager providing the file names needed for one tesseract run.

    Yields a ``(temp_name, input_filename)`` tuple. ``temp_name`` is the name
    of a fresh temporary file (prefix ``tess_``). ``input_filename`` is either
    the normalised real path of ``image`` when it is a ``str``, or a file
    ``temp_name + extsep + extension`` that the prepared image is saved to.

    On exit, every file whose name starts with ``temp_name`` is removed.
    """
    # Remember the name separately: if NamedTemporaryFile() itself fails there
    # is no file object to ask, and the cleanup step must not raise a
    # NameError that would hide the original exception.
    temp_name = None
    try:
        with NamedTemporaryFile(prefix='tess_') as f:
            temp_name = f.name
            if isinstance(image, str):
                yield temp_name, realpath(normpath(normcase(image)))
                return

            image, extension = prepare(image)
            input_file_name = temp_name + extsep + extension
            image.save(input_file_name, format=extension, **image.info)
            yield temp_name, input_file_name
    finally:
        # Runs after the temporary file has been closed by the ``with`` block.
        if temp_name is not None:
            cleanup(temp_name)
|
|
|
|
|
|
def subprocess_args(include_stdout=True):
    """
    Build the keyword arguments used to start a subprocess.

    stdin and stderr are always piped; stdout is piped only when
    ``include_stdout`` is true. Where ``subprocess`` exposes ``STARTUPINFO``,
    a startup-info object requesting a hidden window is included.

    See https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess
    for reference and comments.
    """
    startupinfo = None
    if hasattr(subprocess, 'STARTUPINFO'):
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE

    kwargs = {
        'stdin': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'startupinfo': startupinfo,
        'env': environ,
    }
    if include_stdout:
        kwargs['stdout'] = subprocess.PIPE

    return kwargs
|
|
|
|
|
|
def run_tesseract(
    input_filename,
    output_filename_base,
    extension,
    lang,
    config='',
    nice=0,
    timeout=0,
):
    """
    Build the tesseract command line, run it and wait for completion.

    Raises:
        TesseractNotFoundError: if the executable cannot be found.
        TesseractError: if the process ends with a non-zero return code.
        RuntimeError: on timeout (raised by ``timeout_manager``).
    """
    # ``nice`` is only prepended off Windows and when a priority was requested
    if not sys.platform.startswith('win32') and nice != 0:
        cmd_args = ['nice', '-n', str(nice)]
    else:
        cmd_args = []

    cmd_args.extend((tesseract_cmd, input_filename, output_filename_base))

    if lang is not None:
        cmd_args.extend(('-l', lang))

    if config:
        cmd_args.extend(shlex.split(config))

    # these three extensions are not passed as a trailing config name
    if extension and extension not in {'box', 'osd', 'tsv'}:
        cmd_args.append(extension)

    try:
        proc = subprocess.Popen(cmd_args, **subprocess_args())
    except OSError as e:
        if e.errno == ENOENT:
            raise TesseractNotFoundError()
        raise e

    with timeout_manager(proc, timeout) as error_string:
        if proc.returncode:
            raise TesseractError(proc.returncode, get_errors(error_string))
|
|
|
|
|
|
def run_and_get_output(
    image,
    extension='',
    lang=None,
    config='',
    nice=0,
    timeout=0,
    return_bytes=False,
):
    """
    Run tesseract on ``image`` and return the content of its output file.

    The output file is ``<temp name> + extsep + extension``. Its content is
    returned as raw bytes when ``return_bytes`` is true, otherwise decoded as
    UTF-8 and stripped.
    """
    with save(image) as (temp_name, input_filename):
        run_tesseract(
            input_filename=input_filename,
            output_filename_base=temp_name,
            extension=extension,
            lang=lang,
            config=config,
            nice=nice,
            timeout=timeout,
        )

        output_path = temp_name + extsep + extension
        with open(output_path, 'rb') as output_file:
            raw = output_file.read()

        if return_bytes:
            return raw
        return raw.decode('utf-8').strip()
|
|
|
|
|
|
def file_to_dict(tsv, cell_delimiter, str_col_idx):
    """
    Convert delimited text with a header row into a dict of columns.

    Args:
        tsv: the text; its first non-empty line holds the column names.
        cell_delimiter: string separating the cells of a row.
        str_col_idx: index of the column whose cells are always kept as
            strings; a negative value counts from the last column.

    Returns:
        ``{column name: [cell, ...]}``. Cells made only of digits, optionally
        preceded by a minus sign, are converted to ``int`` (except in the
        string column). Empty input yields ``{}``; a header without data rows
        yields empty lists.
    """
    # Empty lines (e.g. from a trailing newline) carry no cells and would
    # otherwise show up as bogus, misaligned rows.
    rows = [line.split(cell_delimiter) for line in tsv.split('\n') if line]
    if not rows:
        return {}

    header = rows.pop(0)
    length = len(header)
    if rows and len(rows[-1]) < length:
        # Fixes bug that occurs when last text string in TSV is null, and
        # last row is missing a final cell in TSV file
        rows[-1].append('')

    if str_col_idx < 0:
        str_col_idx += length

    result = {}
    for i, head in enumerate(header):
        column = result[head] = []
        for row in rows:
            if len(row) <= i:
                continue

            cell = row[i]
            if i != str_col_idx and cell.lstrip('-').isdigit():
                try:
                    cell = int(cell)
                except ValueError:
                    # e.g. '--5' or non-ASCII digits: keep the raw text
                    pass
            column.append(cell)

    return result
|
|
|
|
|
|
def is_valid(val, _type):
    """
    Tell whether the string ``val`` can be converted to ``_type``.

    ``int`` requires ``val`` to consist of digits only, ``float`` requires
    ``float(val)`` to succeed, and every other type is accepted as is.
    """
    if _type is float:
        try:
            float(val)
        except ValueError:
            return False
        return True

    return val.isdigit() if _type is int else True
|
|
|
|
|
|
def osd_to_dict(osd):
    """
    Parse tesseract's OSD report into a dict.

    Each ``label: value`` line whose label is listed in ``OSD_KEYS`` and whose
    value passes ``is_valid`` for the mapped type is stored under the mapped
    key, converted with the mapped type. All other lines are ignored.
    """
    result = {}
    # splitlines() also copes with '\r\n' endings, which would otherwise
    # leave a '\r' on the value and make integer values fail validation.
    for line in osd.splitlines():
        parts = line.split(': ')
        if len(parts) != 2:
            continue

        label, raw_value = parts
        if label not in OSD_KEYS:
            # unknown entry: skip it instead of failing with KeyError
            continue

        key, convert = OSD_KEYS[label]
        if is_valid(raw_value, convert):
            result[key] = convert(raw_value)

    return result
|
|
|
|
|
|
@run_once
def get_tesseract_version():
    """
    Returns LooseVersion object of the Tesseract version

    The version is the second whitespace-separated token printed by
    ``<tesseract_cmd> --version``, with leading non-digit characters removed.
    Raises TesseractNotFoundError when an OSError occurs.
    """
    try:
        banner = subprocess.check_output(
            [tesseract_cmd, '--version'], stderr=subprocess.STDOUT,
        )
        version_token = banner.decode('utf-8').split()[1]
        # string.printable starts with the ten digits; the rest is stripped
        # from the left of the token
        non_digits = string.printable[10:]
        return LooseVersion(version_token.lstrip(non_digits))
    except OSError:
        raise TesseractNotFoundError()
|
|
|
|
|
|
def image_to_string(
    image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to string

    ``output_type`` selects bytes, a ``{'text': ...}`` dict, or a plain
    string; any other value raises KeyError.
    """
    def as_text():
        return run_and_get_output(image, 'txt', lang, config, nice, timeout)

    def as_bytes():
        return run_and_get_output(
            image, 'txt', lang, config, nice, timeout, True,
        )

    def as_dict():
        return {'text': as_text()}

    handlers = {
        Output.BYTES: as_bytes,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
|
|
|
|
|
|
def image_to_pdf_or_hocr(
    image, lang=None, config='', nice=0, extension='pdf', timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to pdf/hocr

    The result is returned as raw bytes. Raises ValueError when ``extension``
    is neither 'pdf' nor 'hocr'.
    """
    supported = {'pdf', 'hocr'}
    if extension not in supported:
        raise ValueError('Unsupported extension: {}'.format(extension))

    return run_and_get_output(
        image, extension, lang, config, nice, timeout, True,
    )
|
|
|
|
|
|
def image_to_boxes(
    image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0,
):
    """
    Returns string containing recognized characters and their box boundaries

    ``output_type`` selects bytes, a dict of columns (char, left, bottom,
    right, top, page), or a plain string; any other value raises KeyError.
    """
    config = config + ' batch.nochop makebox'

    def as_text():
        return run_and_get_output(image, 'box', lang, config, nice, timeout)

    def as_bytes():
        return run_and_get_output(
            image, 'box', lang, config, nice, timeout, True,
        )

    def as_dict():
        # the box output has no header row, so one is prepended for parsing
        table = 'char left bottom right top page\n' + as_text()
        return file_to_dict(table, ' ', 0)

    handlers = {
        Output.BYTES: as_bytes,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
|
|
|
|
|
|
def get_pandas_output(args, config=None):
    """
    Run tesseract with ``args`` and load its output into a pandas DataFrame.

    ``args`` are passed on to ``run_and_get_output``. ``config`` may hold
    extra ``read_csv`` keyword arguments that are merged over the defaults
    (tab separator, no quoting). Raises PandasNotSupported without pandas.
    """
    if not pandas_installed:
        raise PandasNotSupported()

    read_csv_options = {'quoting': QUOTE_NONE, 'sep': '\t'}
    try:
        read_csv_options.update(config)
    except (TypeError, ValueError):
        # config is None or cannot be used to update a dict
        pass

    tsv_buffer = BytesIO(run_and_get_output(*args))
    return pd.read_csv(tsv_buffer, **read_csv_options)
|
|
|
|
|
|
def image_to_data(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
    pandas_config=None,
):
    """
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+

    ``output_type`` selects bytes, a pandas DataFrame, a dict of columns, or
    a plain string; any other value raises KeyError. Raises TSVNotSupported
    when the installed tesseract is older than 3.05.
    """
    if get_tesseract_version() < '3.05':
        raise TSVNotSupported()

    config = ' '.join(('-c tessedit_create_tsv=1', config.strip())).strip()
    args = [image, 'tsv', lang, config, nice, timeout]

    def as_text():
        return run_and_get_output(*args)

    def as_bytes():
        return run_and_get_output(*(args + [True]))

    def as_dataframe():
        return get_pandas_output(args + [True], pandas_config)

    def as_dict():
        # -1: the last column is kept as text
        return file_to_dict(as_text(), '\t', -1)

    handlers = {
        Output.BYTES: as_bytes,
        Output.DATAFRAME: as_dataframe,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
|
|
|
|
|
|
def image_to_osd(
    image, lang='osd', config='', nice=0, output_type=Output.STRING, timeout=0,
):
    """
    Returns string containing the orientation and script detection (OSD)

    ``output_type`` selects bytes, a dict parsed by ``osd_to_dict``, or a
    plain string; any other value raises KeyError.
    """
    # the option is spelled with one dash before 3.05 and two dashes after
    if get_tesseract_version() < '3.05':
        psm_option = '-psm 0'
    else:
        psm_option = '--psm 0'
    config = '{} {}'.format(psm_option, config.strip()).strip()

    def as_text():
        return run_and_get_output(image, 'osd', lang, config, nice, timeout)

    def as_bytes():
        return run_and_get_output(
            image, 'osd', lang, config, nice, timeout, True,
        )

    def as_dict():
        return osd_to_dict(as_text())

    handlers = {
        Output.BYTES: as_bytes,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
|
|
|
|
|
|
def main():
    """
    Command-line entry point: ``pytesseract [-l lang] input_file``.

    Prints the text recognised in ``input_file`` to stdout. Exits with status
    2 on wrong usage and with status 1 when the image cannot be read or the
    tesseract executable is missing.
    """
    argv = sys.argv
    if len(argv) == 2:
        filename, lang = argv[1], None
    elif len(argv) == 4 and argv[1] == '-l':
        filename, lang = argv[3], argv[2]
    else:
        sys.stderr.write('Usage: pytesseract [-l lang] input_file\n')
        # sys.exit, not the ``exit`` helper: the latter is injected by the
        # ``site`` module and is not guaranteed to exist in every interpreter.
        sys.exit(2)

    try:
        with Image.open(filename) as img:
            print(image_to_string(img, lang=lang))
    except TesseractNotFoundError as e:
        # Must be handled before IOError: on Python 3 this exception is an
        # OSError too and would otherwise be reported as an unreadable file.
        sys.stderr.write('ERROR: %s\n' % e)
        sys.exit(1)
    except IOError:
        sys.stderr.write('ERROR: Could not open file "%s"\n' % filename)
        sys.exit(1)


if __name__ == '__main__':
    main()
|