22 lines
554 B
Python
22 lines
554 B
Python
"""
|
|
Process an image file using tesseract.
|
|
"""
|
|
import os
|
|
|
|
from .utils import ShellParser
|
|
|
|
|
|
class Parser(ShellParser):
    """Extract text from various image file formats using tesseract-ocr."""

    def extract(self, filename, **kwargs):
        """Run the ``tesseract`` command on ``filename`` and return its output.

        Args:
            filename: Path of the image file handed to ``tesseract``.
            **kwargs: Optional settings. Only ``language`` is read here; when
                it holds a non-empty value it is forwarded to tesseract via
                the ``-l`` option. Any other keys are ignored by this method.

        Returns:
            The first element of the pair returned by ``self.run``, which
            this method treats as the command's standard output.
        """
        # Passing 'stdout' as the output base asks tesseract to print the
        # recognized text rather than write it to a file.
        args = ['tesseract', filename, 'stdout']

        # A missing, None or empty language means "use tesseract's default".
        # Forwarding None would place a non-string in the argument list, and
        # an empty string would yield an invalid ``-l ''`` option.
        language = kwargs.get('language')
        if language:
            args.extend(['-l', language])

        # NOTE(review): self.run is presumably inherited from ShellParser --
        # confirm how it reports a failing or missing tesseract executable.
        stdout, _ = self.run(args)
        return stdout
|