844 lines
32 KiB
Python
844 lines
32 KiB
Python
|
#!/usr/bin/env python
|
||
|
# encoding: utf-8
|
||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||
|
# contributor license agreements. See the NOTICE file distributed with
|
||
|
# this work for additional information regarding copyright ownership.
|
||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||
|
# (the "License"); you may not use this file except in compliance with
|
||
|
# the License. You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
#
|
||
|
|
||
|
# Module documentation
|
||
|
'''
|
||
|
Tika Python module provides Python API client to Aapche Tika Server.
|
||
|
|
||
|
**Example usage**::
|
||
|
|
||
|
import tika
|
||
|
from tika import parser
|
||
|
parsed = parser.from_file('/path/to/file')
|
||
|
print(parsed["metadata"])
|
||
|
print(parsed["content"])
|
||
|
|
||
|
Visit https://github.com/chrismattmann/tika-python to learn more about it.
|
||
|
|
||
|
**Detect IANA MIME Type**::
|
||
|
|
||
|
from tika import detector
|
||
|
print(detector.from_file('/path/to/file'))
|
||
|
|
||
|
**Detect Language**::
|
||
|
|
||
|
from tika import language
|
||
|
print(language.from_file('/path/to/file'))
|
||
|
|
||
|
**Use Tika Translate**::
|
||
|
|
||
|
from tika import translate
|
||
|
print(translate.from_file('/path/to/file', 'srcLang', 'destLang')
|
||
|
# Use auto Language detection feature
|
||
|
print(translate.from_file('/path/to/file', 'destLang')
|
||
|
|
||
|
***Tika-Python Configuration***
|
||
|
You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
|
||
|
for details on writing configuration files. Configuration is set the first time the server is started.
|
||
|
To use a configuration file with a parser, or detector:
|
||
|
parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
|
||
|
or:
|
||
|
detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
|
||
|
or:
|
||
|
detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
|
||
|
|
||
|
'''
|
||
|
|
||
|
USAGE = """
|
||
|
tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>
|
||
|
|
||
|
tika.py parse all test.pdf test2.pdf (write output JSON metadata files for test1.pdf_meta.json and test2.pdf_meta.json)
|
||
|
tika.py detect type test.pdf (returns mime-type as text/plain)
|
||
|
tika.py language file french.txt (returns language e.g., fr as text/plain)
|
||
|
tika.py translate fr:en french.txt (translates the file french.txt from french to english)
|
||
|
tika.py config mime-types (see what mime-types the Tika Server can handle)
|
||
|
|
||
|
A simple python and command-line client for Tika using the standalone Tika server (JAR file).
|
||
|
All commands return results in JSON format by default (except text in text/plain).
|
||
|
|
||
|
To parse docs, use:
|
||
|
tika.py parse <meta | text | all> <path>
|
||
|
|
||
|
To check the configuration of the Tika server, use:
|
||
|
tika.py config <mime-types | detectors | parsers>
|
||
|
|
||
|
Commands:
|
||
|
parse = parse the input file and write a JSON doc file.ext_meta.json containing the extracted metadata, text, or both
|
||
|
detect type = parse the stream and 'detect' the MIME/media type, return in text/plain
|
||
|
language file = parse the file stream and identify the language of the text, return its 2 character code in text/plain
|
||
|
translate src:dest = parse and extract text and then translate the text from source language to destination language
|
||
|
config = return a JSON doc describing the configuration of the Tika server (i.e. mime-types it
|
||
|
can handle, or installed detectors or parsers)
|
||
|
|
||
|
Arguments:
|
||
|
urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika
|
||
|
|
||
|
Switches:
|
||
|
--verbose, -v = verbose mode
|
||
|
--encode, -e = encode response in UTF-8
|
||
|
--csv, -c = report detect output in comma-delimited format
|
||
|
--server <TikaServerEndpoint> = use a remote Tika Server at this endpoint, otherwise use local server
|
||
|
--install <UrlToTikaServerJar> = download and exec Tika Server (JAR file), starting server on default port 9998
|
||
|
|
||
|
Example usage as python client:
|
||
|
-- from tika import runCommand, parse1
|
||
|
-- jsonOutput = runCommand('parse', 'all', filename)
|
||
|
or
|
||
|
-- jsonOutput = parse1('all', filename)
|
||
|
|
||
|
"""
|
||
|
|
||
|
import sys, os, getopt, time, codecs, re
|
||
|
try:
|
||
|
unicode_string = unicode
|
||
|
binary_string = str
|
||
|
except NameError:
|
||
|
unicode_string = str
|
||
|
binary_string = bytes
|
||
|
|
||
|
try:
|
||
|
from urllib import urlretrieve
|
||
|
except ImportError:
|
||
|
from urllib.request import urlretrieve
|
||
|
try:
|
||
|
from urlparse import urlparse
|
||
|
except ImportError:
|
||
|
from urllib.parse import urlparse as urlparse
|
||
|
|
||
|
try:
|
||
|
from rfc6266 import build_header
|
||
|
def make_content_disposition_header(fn):
|
||
|
return build_header(os.path.basename(fn)).decode('ascii')
|
||
|
except ImportError:
|
||
|
def make_content_disposition_header(fn):
|
||
|
return 'attachment; filename=%s' % os.path.basename(fn)
|
||
|
|
||
|
if sys.version_info[0] < 3:
|
||
|
open = codecs.open
|
||
|
|
||
|
import requests
|
||
|
import socket
|
||
|
import tempfile
|
||
|
import hashlib
|
||
|
import platform
|
||
|
from subprocess import Popen
|
||
|
from subprocess import STDOUT
|
||
|
from os import walk
|
||
|
import logging
|
||
|
|
||
|
log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir())
|
||
|
log_file = os.path.join(log_path, 'tika.log')
|
||
|
|
||
|
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
|
||
|
log = logging.getLogger('tika.tika')
|
||
|
|
||
|
# File logs
|
||
|
fileHandler = logging.FileHandler(log_file)
|
||
|
fileHandler.setFormatter(logFormatter)
|
||
|
log.addHandler(fileHandler)
|
||
|
|
||
|
# Stdout logs
|
||
|
consoleHandler = logging.StreamHandler()
|
||
|
consoleHandler.setFormatter(logFormatter)
|
||
|
log.addHandler(consoleHandler)
|
||
|
|
||
|
# Log level
|
||
|
log.setLevel(logging.INFO)
|
||
|
|
||
|
Windows = True if platform.system() == "Windows" else False
|
||
|
TikaVersion = os.getenv('TIKA_VERSION', '1.23')
|
||
|
TikaJarPath = os.getenv('TIKA_PATH', tempfile.gettempdir())
|
||
|
TikaFilesPath = tempfile.gettempdir()
|
||
|
TikaServerLogFilePath = log_path
|
||
|
TikaServerJar = os.getenv(
|
||
|
'TIKA_SERVER_JAR',
|
||
|
"http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/"+TikaVersion+"/tika-server-"+TikaVersion+".jar")
|
||
|
ServerHost = "localhost"
|
||
|
Port = "9998"
|
||
|
ServerEndpoint = os.getenv(
|
||
|
'TIKA_SERVER_ENDPOINT', 'http://' + ServerHost + ':' + Port)
|
||
|
Translator = os.getenv(
|
||
|
'TIKA_TRANSLATOR',
|
||
|
"org.apache.tika.language.translate.Lingo24Translator")
|
||
|
TikaClientOnly = os.getenv('TIKA_CLIENT_ONLY', False)
|
||
|
TikaServerClasspath = os.getenv('TIKA_SERVER_CLASSPATH', '')
|
||
|
TikaStartupSleep = float(os.getenv('TIKA_STARTUP_SLEEP', 5))
|
||
|
TikaStartupMaxRetry = int(os.getenv('TIKA_STARTUP_MAX_RETRY', 3))
|
||
|
TikaJava = os.getenv("TIKA_JAVA", "java")
|
||
|
TikaJavaArgs = os.getenv("TIKA_JAVA_ARGS", '')
|
||
|
|
||
|
Verbose = 0
|
||
|
EncodeUtf8 = 0
|
||
|
csvOutput = 0
|
||
|
|
||
|
class TikaException(Exception):
|
||
|
pass
|
||
|
|
||
|
def echo2(*s): sys.stderr.write(unicode_string('tika.py: %s\n') % unicode_string(' ').join(map(unicode_string, s)))
|
||
|
def warn(*s): echo2('Warn:', *s)
|
||
|
def die(*s): warn('Error:', *s); echo2(USAGE); sys.exit()
|
||
|
|
||
|
def runCommand(cmd, option, urlOrPaths, port, outDir=None,
|
||
|
serverHost=ServerHost, tikaServerJar=TikaServerJar,
|
||
|
verbose=Verbose, encode=EncodeUtf8):
|
||
|
'''
|
||
|
Run the Tika command by calling the Tika server and return results in JSON format (or plain text).
|
||
|
:param cmd: a command from set ``{'parse', 'detect', 'language', 'translate', 'config'}``
|
||
|
:param option:
|
||
|
:param urlOrPaths:
|
||
|
:param port:
|
||
|
:param outDir:
|
||
|
:param serverHost:
|
||
|
:param tikaServerJar:
|
||
|
:param verbose:
|
||
|
:param encode:
|
||
|
:return: response for the command, usually a ``dict``
|
||
|
'''
|
||
|
# import pdb; pdb.set_trace()
|
||
|
if (cmd in 'parse' or cmd in 'detect') and (urlOrPaths == [] or urlOrPaths == None):
|
||
|
log.exception('No URLs/paths specified.')
|
||
|
raise TikaException('No URLs/paths specified.')
|
||
|
serverEndpoint = 'http://' + serverHost + ':' + port
|
||
|
if cmd == 'parse':
|
||
|
return parseAndSave(option, urlOrPaths, outDir, serverEndpoint, verbose, tikaServerJar)
|
||
|
elif cmd == "detect":
|
||
|
return detectType(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
|
||
|
elif cmd == "language":
|
||
|
return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
|
||
|
elif cmd == "translate":
|
||
|
return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
|
||
|
elif cmd == "config":
|
||
|
status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
|
||
|
return resp
|
||
|
else:
|
||
|
log.exception('Bad args')
|
||
|
raise TikaException('Bad args')
|
||
|
|
||
|
|
||
|
def getPaths(urlOrPaths):
|
||
|
'''
|
||
|
Determines if the given URL in urlOrPaths is a URL or a file or directory. If it's
|
||
|
a directory, it walks the directory and then finds all file paths in it, and ads them
|
||
|
too. If it's a file, it adds it to the paths. If it's a URL it just adds it to the path.
|
||
|
:param urlOrPaths: the url or path to be scanned
|
||
|
:return: ``list`` of paths
|
||
|
'''
|
||
|
if isinstance(urlOrPaths, unicode_string):
|
||
|
urlOrPaths = [urlOrPaths] # do not recursively walk over letters of a single path which can include "/"
|
||
|
paths = []
|
||
|
for eachUrlOrPaths in urlOrPaths:
|
||
|
if os.path.isdir(eachUrlOrPaths):
|
||
|
for root, directories, filenames in walk(eachUrlOrPaths):
|
||
|
for filename in filenames:
|
||
|
paths.append(os.path.join(root,filename))
|
||
|
else:
|
||
|
paths.append(eachUrlOrPaths)
|
||
|
return paths
|
||
|
|
||
|
def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='application/json', metaExtension='_meta.json',
|
||
|
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}):
|
||
|
'''
|
||
|
Parse the objects and write extracted metadata and/or text in JSON format to matching
|
||
|
filename with an extension of '_meta.json'.
|
||
|
:param option:
|
||
|
:param urlOrPaths:
|
||
|
:param outDir:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param metaExtension:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
metaPaths = []
|
||
|
paths = getPaths(urlOrPaths)
|
||
|
for path in paths:
|
||
|
if outDir is None:
|
||
|
metaPath = path + metaExtension
|
||
|
else:
|
||
|
metaPath = os.path.join(outDir, os.path.split(path)[1] + metaExtension)
|
||
|
log.info('Writing %s' % metaPath)
|
||
|
with open(metaPath, 'w', encoding='utf-8') as f:
|
||
|
f.write(parse1(option, path, serverEndpoint, verbose, tikaServerJar, \
|
||
|
responseMimeType, services)[1] + u"\n")
|
||
|
metaPaths.append(metaPath)
|
||
|
return metaPaths
|
||
|
|
||
|
|
||
|
def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='application/json',
|
||
|
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
|
||
|
'''
|
||
|
Parse the objects and return extracted metadata and/or text in JSON format.
|
||
|
:param option:
|
||
|
:param urlOrPaths:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
return [parse1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
|
||
|
for path in urlOrPaths]
|
||
|
|
||
|
def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='application/json',
|
||
|
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None, requestOptions={}):
|
||
|
'''
|
||
|
Parse the object and return extracted metadata and/or text in JSON format.
|
||
|
:param option:
|
||
|
:param urlOrPath:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:param rawResponse:
|
||
|
:param headers:
|
||
|
:return:
|
||
|
'''
|
||
|
headers = headers or {}
|
||
|
|
||
|
path, file_type = getRemoteFile(urlOrPath, TikaFilesPath)
|
||
|
headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
|
||
|
|
||
|
if option not in services:
|
||
|
log.warning('config option must be one of meta, text, or all; using all.')
|
||
|
service = services.get(option, services['all'])
|
||
|
if service == '/tika': responseMimeType = 'text/plain'
|
||
|
headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
|
||
|
with open(path, 'rb') as f:
|
||
|
status, response = callServer('put', serverEndpoint, service, f,
|
||
|
headers, verbose, tikaServerJar, config_path=config_path,
|
||
|
rawResponse=rawResponse, requestOptions=requestOptions)
|
||
|
|
||
|
if file_type == 'remote': os.unlink(path)
|
||
|
return (status, response)
|
||
|
|
||
|
def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='text/plain',
|
||
|
services={'file' : '/language/stream'}):
|
||
|
'''
|
||
|
Detect the language of the provided stream and return its 2 character code as text/plain.
|
||
|
:param option:
|
||
|
:param urlOrPaths:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
paths = getPaths(urlOrPaths)
|
||
|
return [detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
|
||
|
for path in paths]
|
||
|
|
||
|
def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='text/plain',
|
||
|
services={'file' : '/language/stream'}, requestOptions={}):
|
||
|
'''
|
||
|
Detect the language of the provided stream and return its 2 character code as text/plain.
|
||
|
:param option:
|
||
|
:param urlOrPath:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
|
||
|
if option not in services:
|
||
|
log.exception('Language option must be one of %s ' % binary_string(services.keys()))
|
||
|
raise TikaException('Language option must be one of %s ' % binary_string(services.keys()))
|
||
|
service = services[option]
|
||
|
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
|
||
|
{'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions)
|
||
|
return (status, response)
|
||
|
|
||
|
def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='text/plain',
|
||
|
services={'all': '/translate/all'}):
|
||
|
'''
|
||
|
Translate the file from source language to destination language.
|
||
|
:param option:
|
||
|
:param urlOrPaths:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
paths = getPaths(urlOrPaths)
|
||
|
return [doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
|
||
|
for path in paths]
|
||
|
|
||
|
def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='text/plain',
|
||
|
services={'all': '/translate/all'}, requestOptions={}):
|
||
|
'''
|
||
|
|
||
|
:param option:
|
||
|
:param urlOrPath:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
|
||
|
srcLang = ""
|
||
|
destLang = ""
|
||
|
|
||
|
if ":" in option:
|
||
|
options = option.rsplit(':')
|
||
|
srcLang = options[0]
|
||
|
destLang = options[1]
|
||
|
if len(options) != 2:
|
||
|
log.exception('Translate options are specified as srcLang:destLang or as destLang')
|
||
|
raise TikaException('Translate options are specified as srcLang:destLang or as destLang')
|
||
|
else:
|
||
|
destLang = option
|
||
|
|
||
|
if srcLang != "" and destLang != "":
|
||
|
service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
|
||
|
else:
|
||
|
service = services["all"] + "/" + Translator + "/" + destLang
|
||
|
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
|
||
|
{'Accept' : responseMimeType},
|
||
|
verbose, tikaServerJar, requestOptions=requestOptions)
|
||
|
return (status, response)
|
||
|
|
||
|
def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='text/plain',
|
||
|
services={'type': '/detect/stream'}):
|
||
|
'''
|
||
|
Detect the MIME/media type of the stream and return it in text/plain.
|
||
|
:param option:
|
||
|
:param urlOrPaths:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
paths = getPaths(urlOrPaths)
|
||
|
return [detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
|
||
|
for path in paths]
|
||
|
|
||
|
def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
responseMimeType='text/plain',
|
||
|
services={'type': '/detect/stream'}, config_path=None, requestOptions={}):
|
||
|
'''
|
||
|
Detect the MIME/media type of the stream and return it in text/plain.
|
||
|
:param option:
|
||
|
:param urlOrPath:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
|
||
|
if option not in services:
|
||
|
log.exception('Detect option must be one of %s' % binary_string(services.keys()))
|
||
|
raise TikaException('Detect option must be one of %s' % binary_string(services.keys()))
|
||
|
service = services[option]
|
||
|
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
|
||
|
{
|
||
|
'Accept': responseMimeType,
|
||
|
'Content-Disposition': make_content_disposition_header(path)
|
||
|
},
|
||
|
verbose, tikaServerJar, config_path=config_path, requestOptions=requestOptions)
|
||
|
if csvOutput == 1:
|
||
|
return(status, urlOrPath.decode("UTF-8") + "," + response)
|
||
|
else:
|
||
|
return (status, response)
|
||
|
|
||
|
def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json',
|
||
|
services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}, requestOptions={}):
|
||
|
'''
|
||
|
Get the configuration of the Tika Server (parsers, detectors, etc.) and return it in JSON format.
|
||
|
:param option:
|
||
|
:param serverEndpoint:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param responseMimeType:
|
||
|
:param services:
|
||
|
:return:
|
||
|
'''
|
||
|
if option not in services:
|
||
|
die('config option must be one of mime-types, detectors, or parsers')
|
||
|
service = services[option]
|
||
|
status, response = callServer('get', serverEndpoint, service, None, {'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions)
|
||
|
return (status, response)
|
||
|
|
||
|
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
|
||
|
httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
|
||
|
rawResponse=False,config_path=None, requestOptions={}):
|
||
|
'''
|
||
|
Call the Tika Server, do some error checking, and return the response.
|
||
|
:param verb:
|
||
|
:param serverEndpoint:
|
||
|
:param service:
|
||
|
:param data:
|
||
|
:param headers:
|
||
|
:param verbose:
|
||
|
:param tikaServerJar:
|
||
|
:param httpVerbs:
|
||
|
:param classpath:
|
||
|
:return:
|
||
|
'''
|
||
|
parsedUrl = urlparse(serverEndpoint)
|
||
|
serverHost = parsedUrl.hostname
|
||
|
scheme = parsedUrl.scheme
|
||
|
|
||
|
port = parsedUrl.port
|
||
|
if classpath is None:
|
||
|
classpath = TikaServerClasspath
|
||
|
|
||
|
global TikaClientOnly
|
||
|
if not TikaClientOnly:
|
||
|
serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
|
||
|
|
||
|
serviceUrl = serverEndpoint + service
|
||
|
if verb not in httpVerbs:
|
||
|
log.exception('Tika Server call must be one of %s' % binary_string(httpVerbs.keys()))
|
||
|
raise TikaException('Tika Server call must be one of %s' % binary_string(httpVerbs.keys()))
|
||
|
verbFn = httpVerbs[verb]
|
||
|
|
||
|
if Windows and hasattr(data, "read"):
|
||
|
data = data.read()
|
||
|
|
||
|
encodedData = data
|
||
|
if type(data) is unicode_string:
|
||
|
encodedData = data.encode('utf-8')
|
||
|
|
||
|
requestOptionsDefault = {
|
||
|
'timeout': 60,
|
||
|
'headers': headers,
|
||
|
'verify': False
|
||
|
}
|
||
|
effectiveRequestOptions = requestOptionsDefault.copy()
|
||
|
effectiveRequestOptions.update(requestOptions)
|
||
|
|
||
|
resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)
|
||
|
|
||
|
if verbose:
|
||
|
print(sys.stderr, "Request headers: ", headers)
|
||
|
print(sys.stderr, "Response headers: ", resp.headers)
|
||
|
if resp.status_code != 200:
|
||
|
log.warning('Tika server returned status: %d', resp.status_code)
|
||
|
|
||
|
resp.encoding = "utf-8"
|
||
|
if rawResponse:
|
||
|
return (resp.status_code, resp.content)
|
||
|
else:
|
||
|
return (resp.status_code, resp.text)
|
||
|
|
||
|
|
||
|
def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None, config_path=None):
|
||
|
'''
|
||
|
Check that tika-server is running. If not, download JAR file and start it up.
|
||
|
:param scheme: e.g. http or https
|
||
|
:param serverHost:
|
||
|
:param port:
|
||
|
:param tikaServerJar:
|
||
|
:param classpath:
|
||
|
:return:
|
||
|
'''
|
||
|
if classpath is None:
|
||
|
classpath = TikaServerClasspath
|
||
|
if port is None:
|
||
|
port = '443' if scheme == 'https' else '80'
|
||
|
|
||
|
urlp = urlparse(tikaServerJar)
|
||
|
serverEndpoint = '%s://%s:%s' % (scheme, serverHost, port)
|
||
|
jarPath = os.path.join(TikaJarPath, 'tika-server.jar')
|
||
|
if 'localhost' in serverEndpoint or '127.0.0.1' in serverEndpoint:
|
||
|
alreadyRunning = checkPortIsOpen(serverHost, port)
|
||
|
|
||
|
if not alreadyRunning:
|
||
|
if not os.path.isfile(jarPath) and urlp.scheme != '':
|
||
|
getRemoteJar(tikaServerJar, jarPath)
|
||
|
|
||
|
if not checkJarSig(tikaServerJar, jarPath):
|
||
|
os.remove(jarPath)
|
||
|
tikaServerJar = getRemoteJar(tikaServerJar, jarPath)
|
||
|
|
||
|
status = startServer(jarPath, TikaJava, TikaJavaArgs, serverHost, port, classpath, config_path)
|
||
|
if not status:
|
||
|
log.error("Failed to receive startup confirmation from startServer.")
|
||
|
raise RuntimeError("Unable to start Tika server.")
|
||
|
return serverEndpoint
|
||
|
|
||
|
def checkJarSig(tikaServerJar, jarPath):
|
||
|
'''
|
||
|
Checks the signature of Jar
|
||
|
:param tikaServerJar:
|
||
|
:param jarPath:
|
||
|
:return: ``True`` if the signature of the jar matches
|
||
|
'''
|
||
|
if not os.path.isfile(jarPath + ".md5"):
|
||
|
getRemoteJar(tikaServerJar + ".md5", jarPath + ".md5")
|
||
|
m = hashlib.md5()
|
||
|
with open(jarPath, 'rb') as f:
|
||
|
binContents = f.read()
|
||
|
m.update(binContents)
|
||
|
with open(jarPath + ".md5", "r") as em:
|
||
|
existingContents = em.read()
|
||
|
return existingContents == m.hexdigest()
|
||
|
|
||
|
|
||
|
def startServer(tikaServerJar, java_path = TikaJava, java_args = TikaJavaArgs, serverHost = ServerHost, port = Port, classpath=None, config_path=None):
|
||
|
'''
|
||
|
Starts Tika Server
|
||
|
:param tikaServerJar: path to tika server jar
|
||
|
:param serverHost: the host interface address to be used for binding the service
|
||
|
:param port: the host port to be used for binding the service
|
||
|
:param classpath: Class path value to pass to JVM
|
||
|
:return: None
|
||
|
'''
|
||
|
if classpath is None:
|
||
|
classpath = TikaServerClasspath
|
||
|
|
||
|
host = "localhost"
|
||
|
if Windows:
|
||
|
host = "0.0.0.0"
|
||
|
|
||
|
if classpath:
|
||
|
classpath += ":" + tikaServerJar
|
||
|
else:
|
||
|
classpath = tikaServerJar
|
||
|
|
||
|
# setup command string
|
||
|
cmd_string = ""
|
||
|
if not config_path:
|
||
|
cmd_string = '%s %s -cp %s org.apache.tika.server.TikaServerCli --port %s --host %s &' \
|
||
|
% (java_path, java_args, classpath, port, host)
|
||
|
else:
|
||
|
cmd_string = '%s %s -cp %s org.apache.tika.server.TikaServerCli --port %s --host %s --config %s &' \
|
||
|
% (java_path, java_args, classpath, port, host, config_path)
|
||
|
|
||
|
# Check that we can write to log path
|
||
|
try:
|
||
|
tika_log_file_path = os.path.join(TikaServerLogFilePath, 'tika-server.log')
|
||
|
logFile = open(tika_log_file_path, 'w')
|
||
|
except PermissionError as e:
|
||
|
log.error("Unable to create tika-server.log at %s due to permission error." % (TikaServerLogFilePath))
|
||
|
return False
|
||
|
|
||
|
# Check that specified java binary is available on path
|
||
|
try:
|
||
|
_ = Popen(java_path, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w"))
|
||
|
except FileNotFoundError as e:
|
||
|
log.error("Unable to run java; is it installed?")
|
||
|
return False
|
||
|
|
||
|
# Run java with jar args
|
||
|
cmd = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
|
||
|
|
||
|
# Check logs and retry as configured
|
||
|
try_count = 0
|
||
|
is_started = False
|
||
|
while try_count < TikaStartupMaxRetry:
|
||
|
with open(tika_log_file_path, "r") as tika_log_file_tmp:
|
||
|
# check for INFO string to confirm listening endpoint
|
||
|
if "Started Apache Tika server at" in tika_log_file_tmp.read():
|
||
|
is_started = True
|
||
|
else:
|
||
|
log.warning("Failed to see startup log message; retrying...")
|
||
|
time.sleep(TikaStartupSleep)
|
||
|
try_count += 1
|
||
|
|
||
|
if not is_started:
|
||
|
log.error("Tika startup log message not received after %d tries." % (TikaStartupMaxRetry))
|
||
|
return False
|
||
|
else:
|
||
|
return True
|
||
|
|
||
|
def toFilename(url):
|
||
|
'''
|
||
|
gets url and returns filename
|
||
|
'''
|
||
|
urlp = urlparse(url)
|
||
|
path = urlp.path
|
||
|
if not path:
|
||
|
path = "file_{}".format(int(time.time()))
|
||
|
value = re.sub(r'[^\w\s\.\-]', '-', path).strip().lower()
|
||
|
return re.sub(r'[-\s]+', '-', value).strip("-")[-200:]
|
||
|
|
||
|
|
||
|
def getRemoteFile(urlOrPath, destPath):
|
||
|
'''
|
||
|
Fetches URL to local path or just returns absolute path.
|
||
|
:param urlOrPath: resource locator, generally URL or path
|
||
|
:param destPath: path to store the resource, usually a path on file system
|
||
|
:return: tuple having (path, 'local'/'remote')
|
||
|
'''
|
||
|
urlp = urlparse(urlOrPath)
|
||
|
if urlp.scheme == '':
|
||
|
return (os.path.abspath(urlOrPath), 'local')
|
||
|
elif urlp.scheme not in ('http', 'https'):
|
||
|
return (urlOrPath, 'local')
|
||
|
else:
|
||
|
filename = toFilename(urlOrPath)
|
||
|
destPath = destPath + '/' + filename
|
||
|
log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
|
||
|
try:
|
||
|
urlretrieve(urlOrPath, destPath)
|
||
|
except IOError:
|
||
|
# monkey patch fix for SSL/Windows per Tika-Python #54
|
||
|
# https://github.com/chrismattmann/tika-python/issues/54
|
||
|
import ssl
|
||
|
if hasattr(ssl, '_create_unverified_context'):
|
||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||
|
# delete whatever we had there
|
||
|
if os.path.exists(destPath) and os.path.isfile(destPath):
|
||
|
os.remove(destPath)
|
||
|
urlretrieve(urlOrPath, destPath)
|
||
|
return (destPath, 'remote')
|
||
|
|
||
|
def getRemoteJar(urlOrPath, destPath):
|
||
|
'''
|
||
|
Fetches URL to local path or just return absolute path.
|
||
|
:param urlOrPath: remote resource locator
|
||
|
:param destPath: Path to store the resource, usually a path on file system
|
||
|
:return: tuple having (path, 'local'/'remote')
|
||
|
'''
|
||
|
urlp = urlparse(urlOrPath)
|
||
|
if urlp.scheme == '':
|
||
|
return (os.path.abspath(urlOrPath), 'local')
|
||
|
else:
|
||
|
log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
|
||
|
try:
|
||
|
urlretrieve(urlOrPath, destPath)
|
||
|
except IOError:
|
||
|
# monkey patch fix for SSL/Windows per Tika-Python #54
|
||
|
# https://github.com/chrismattmann/tika-python/issues/54
|
||
|
import ssl
|
||
|
if hasattr(ssl, '_create_unverified_context'):
|
||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||
|
# delete whatever we had there
|
||
|
if os.path.exists(destPath) and os.path.isfile(destPath):
|
||
|
os.remove(destPath)
|
||
|
urlretrieve(urlOrPath, destPath)
|
||
|
|
||
|
return (destPath, 'remote')
|
||
|
|
||
|
def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):
|
||
|
'''
|
||
|
Checks if the specified port is open
|
||
|
:param remoteServerHost: the host address
|
||
|
:param port: port which needs to be checked
|
||
|
:return: ``True`` if port is open, ``False`` otherwise
|
||
|
'''
|
||
|
remoteServerIP = socket.gethostbyname(remoteServerHost)
|
||
|
try:
|
||
|
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||
|
result = sock.connect_ex((remoteServerIP, int(port)))
|
||
|
if result == 0:
|
||
|
return True
|
||
|
else :
|
||
|
return False
|
||
|
|
||
|
except KeyboardInterrupt:
|
||
|
print("You pressed Ctrl+C")
|
||
|
sys.exit()
|
||
|
|
||
|
except socket.gaierror:
|
||
|
print('Hostname could not be resolved. Exiting')
|
||
|
sys.exit()
|
||
|
|
||
|
except socket.error:
|
||
|
print("Couldn't connect to server")
|
||
|
sys.exit()
|
||
|
|
||
|
finally:
|
||
|
sock.close()
|
||
|
|
||
|
def main(argv=None):
|
||
|
"""Run Tika from command line according to USAGE."""
|
||
|
global Verbose
|
||
|
global EncodeUtf8
|
||
|
global csvOutput
|
||
|
if argv is None:
|
||
|
argv = sys.argv
|
||
|
|
||
|
if (len(argv) < 3 and not (('-h' in argv) or ('--help' in argv))):
|
||
|
log.exception('Bad args')
|
||
|
raise TikaException('Bad args')
|
||
|
try:
|
||
|
opts, argv = getopt.getopt(argv[1:], 'hi:s:o:p:v:e:c',
|
||
|
['help', 'install=', 'server=', 'output=', 'port=', 'verbose', 'encode', 'csv'])
|
||
|
except getopt.GetoptError as opt_error:
|
||
|
msg, bad_opt = opt_error
|
||
|
log.exception("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg))
|
||
|
raise TikaException("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg))
|
||
|
|
||
|
tikaServerJar = TikaServerJar
|
||
|
serverHost = ServerHost
|
||
|
outDir = '.'
|
||
|
port = Port
|
||
|
for opt, val in opts:
|
||
|
if opt in ('-h', '--help'): echo2(USAGE); sys.exit()
|
||
|
elif opt in ('--install'): tikaServerJar = val
|
||
|
elif opt in ('--server'): serverHost = val
|
||
|
elif opt in ('-o', '--output'): outDir = val
|
||
|
elif opt in ('--port'): port = val
|
||
|
elif opt in ('-v', '--verbose'): Verbose = 1
|
||
|
elif opt in ('-e', '--encode'): EncodeUtf8 = 1
|
||
|
elif opt in ('-c', '--csv'): csvOutput = 1
|
||
|
else:
|
||
|
raise TikaException(USAGE)
|
||
|
|
||
|
cmd = argv[0]
|
||
|
option = argv[1]
|
||
|
try:
|
||
|
paths = argv[2:]
|
||
|
except:
|
||
|
paths = None
|
||
|
return runCommand(cmd, option, paths, port, outDir, serverHost=serverHost, tikaServerJar=tikaServerJar, verbose=Verbose, encode=EncodeUtf8)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
log.info("Logging on '%s'" % (log_file))
|
||
|
resp = main(sys.argv)
|
||
|
|
||
|
# Set encoding of the terminal to UTF-8
|
||
|
if sys.version.startswith("2"):
|
||
|
# Python 2.x
|
||
|
out = codecs.getwriter("UTF-8")(sys.stdout)
|
||
|
elif sys.version.startswith("3"):
|
||
|
# Python 3.x
|
||
|
out = codecs.getwriter("UTF-8")(sys.stdout.buffer)
|
||
|
|
||
|
if type(resp) == list:
|
||
|
out.write('\n'.join([r[1] for r in resp]))
|
||
|
else:
|
||
|
out.write(resp)
|
||
|
out.write('\n')
|
||
|
|