PCQRSCANER/venv/Lib/site-packages/tika/tika.py
2019-12-22 21:51:47 +01:00

844 lines
32 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Module documentation
'''
Tika Python module provides a Python API client for the Apache Tika Server.
**Example usage**::
import tika
from tika import parser
parsed = parser.from_file('/path/to/file')
print(parsed["metadata"])
print(parsed["content"])
Visit https://github.com/chrismattmann/tika-python to learn more about it.
**Detect IANA MIME Type**::
from tika import detector
print(detector.from_file('/path/to/file'))
**Detect Language**::
from tika import language
print(language.from_file('/path/to/file'))
**Use Tika Translate**::
from tika import translate
print(translate.from_file('/path/to/file', 'srcLang', 'destLang'))
# Use auto Language detection feature
print(translate.from_file('/path/to/file', 'destLang'))
***Tika-Python Configuration***
You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
for details on writing configuration files. Configuration is set the first time the server is started.
To use a configuration file with a parser, or detector:
parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
or:
detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
or:
detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
'''
# Command-line help text. It is echoed to stderr by die() and by main() when
# -h/--help is given; the text itself is runtime output and is kept verbatim.
USAGE = """
tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>
tika.py parse all test.pdf test2.pdf (write output JSON metadata files for test1.pdf_meta.json and test2.pdf_meta.json)
tika.py detect type test.pdf (returns mime-type as text/plain)
tika.py language file french.txt (returns language e.g., fr as text/plain)
tika.py translate fr:en french.txt (translates the file french.txt from french to english)
tika.py config mime-types (see what mime-types the Tika Server can handle)
A simple python and command-line client for Tika using the standalone Tika server (JAR file).
All commands return results in JSON format by default (except text in text/plain).
To parse docs, use:
tika.py parse <meta | text | all> <path>
To check the configuration of the Tika server, use:
tika.py config <mime-types | detectors | parsers>
Commands:
parse = parse the input file and write a JSON doc file.ext_meta.json containing the extracted metadata, text, or both
detect type = parse the stream and 'detect' the MIME/media type, return in text/plain
language file = parse the file stream and identify the language of the text, return its 2 character code in text/plain
translate src:dest = parse and extract text and then translate the text from source language to destination language
config = return a JSON doc describing the configuration of the Tika server (i.e. mime-types it
can handle, or installed detectors or parsers)
Arguments:
urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika
Switches:
--verbose, -v = verbose mode
--encode, -e = encode response in UTF-8
--csv, -c = report detect output in comma-delimited format
--server <TikaServerEndpoint> = use a remote Tika Server at this endpoint, otherwise use local server
--install <UrlToTikaServerJar> = download and exec Tika Server (JAR file), starting server on default port 9998
Example usage as python client:
-- from tika import runCommand, parse1
-- jsonOutput = runCommand('parse', 'all', filename)
or
-- jsonOutput = parse1('all', filename)
"""
import sys, os, getopt, time, codecs, re
# Python 2/3 text-type shim: the name ``unicode`` only exists on Python 2, so
# the NameError raised elsewhere selects the str/bytes pair instead.
try:
    unicode_string, binary_string = unicode, str
except NameError:
    unicode_string, binary_string = str, bytes
try:
from urllib import urlretrieve
except ImportError:
from urllib.request import urlretrieve
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse as urlparse
# Build the Content-Disposition header with rfc6266 when that optional package
# is importable; otherwise fall back to a plain "attachment; filename=..." value.
try:
    from rfc6266 import build_header
except ImportError:
    def make_content_disposition_header(fn):
        '''
        Return a Content-Disposition header value for the base name of ``fn``.
        :param fn: path of the file being sent
        :return: ``'attachment; filename=<basename>'``
        '''
        base_name = os.path.basename(fn)
        return 'attachment; filename=%s' % base_name
else:
    def make_content_disposition_header(fn):
        '''
        Return a Content-Disposition header value for the base name of ``fn``.
        :param fn: path of the file being sent
        :return: the header built by ``rfc6266.build_header``, decoded as ASCII
        '''
        encoded_header = build_header(os.path.basename(fn))
        return encoded_header.decode('ascii')
# On Python 2, route open() through codecs.open so that callers in this module
# can pass an ``encoding`` keyword (as parseAndSave does).
if sys.version_info < (3,):
    open = codecs.open
import requests
import socket
import tempfile
import hashlib
import platform
from subprocess import Popen
from subprocess import STDOUT
from os import walk
import logging
# ---------------------------------------------------------------------------
# Logging: records go to <log_path>/tika.log and to a console stream handler.
# The directory comes from TIKA_LOG_PATH, defaulting to the system temp dir.
# ---------------------------------------------------------------------------
log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir())
log_file = os.path.join(log_path, 'tika.log')
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
log = logging.getLogger('tika.tika')

fileHandler = logging.FileHandler(log_file)      # file logs
consoleHandler = logging.StreamHandler()         # console logs
for _handler in (fileHandler, consoleHandler):
    _handler.setFormatter(logFormatter)
    log.addHandler(_handler)
del _handler

log.setLevel(logging.INFO)

# ---------------------------------------------------------------------------
# Module configuration; most values can be overridden through environment
# variables read once at import time.
# ---------------------------------------------------------------------------
Windows = (platform.system() == "Windows")

TikaVersion = os.getenv('TIKA_VERSION', '1.23')
TikaJarPath = os.getenv('TIKA_PATH', tempfile.gettempdir())
TikaFilesPath = tempfile.gettempdir()
TikaServerLogFilePath = log_path
TikaServerJar = os.getenv(
    'TIKA_SERVER_JAR',
    "http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/"
    + TikaVersion + "/tika-server-" + TikaVersion + ".jar")

ServerHost = "localhost"
Port = "9998"
ServerEndpoint = os.getenv('TIKA_SERVER_ENDPOINT', 'http://' + ServerHost + ':' + Port)

Translator = os.getenv('TIKA_TRANSLATOR', "org.apache.tika.language.translate.Lingo24Translator")

# NOTE(review): os.getenv returns a string when the variable is set, so any
# non-empty value (including "False") is truthy here -- confirm this is intended.
TikaClientOnly = os.getenv('TIKA_CLIENT_ONLY', False)
TikaServerClasspath = os.getenv('TIKA_SERVER_CLASSPATH', '')
TikaStartupSleep = float(os.getenv('TIKA_STARTUP_SLEEP', 5))
TikaStartupMaxRetry = int(os.getenv('TIKA_STARTUP_MAX_RETRY', 3))
TikaJava = os.getenv("TIKA_JAVA", "java")
TikaJavaArgs = os.getenv("TIKA_JAVA_ARGS", '')

# Command-line switches; main() flips these to 1.
Verbose = 0
EncodeUtf8 = 0
csvOutput = 0
class TikaException(Exception):
    '''Error raised by this module for bad arguments or unsupported options.'''
def echo2(*s):
    '''Write the arguments to stderr, space-separated and prefixed with "tika.py: ".'''
    joined = unicode_string(' ').join(unicode_string(item) for item in s)
    sys.stderr.write(unicode_string('tika.py: %s\n') % joined)
def warn(*s):
    '''Report a warning on stderr through echo2, prefixed with "Warn:".'''
    echo2(*(('Warn:',) + s))
def die(*s):
    '''
    Report a fatal error and the usage text on stderr, then terminate.
    :param s: message parts, reported through ``warn`` with an "Error:" prefix
    :raises SystemExit: always, with exit status 1 so that shells and calling
        scripts can tell the failure apart from a successful run
    '''
    warn('Error:', *s)
    echo2(USAGE)
    sys.exit(1)
def runCommand(cmd, option, urlOrPaths, port, outDir=None,
               serverHost=ServerHost, tikaServerJar=TikaServerJar,
               verbose=Verbose, encode=EncodeUtf8):
    '''
    Run the Tika command by calling the Tika server and return results in JSON format (or plain text).
    :param cmd: a command from set ``{'parse', 'detect', 'language', 'translate', 'config'}``
    :param option: sub-option passed through to the handler of ``cmd``
    :param urlOrPaths: URLs and/or paths to process; required for ``parse`` and ``detect``
    :param port: port of the Tika server (string or integer)
    :param outDir: output directory, used by ``parse`` only
    :param serverHost: host name of the Tika server
    :param tikaServerJar: URL or path of the Tika server jar
    :param verbose: verbosity flag passed through to the handlers
    :param encode: accepted for compatibility; not used by this function
    :return: response for the command, usually a ``dict``
    :raises TikaException: if no paths are given for ``parse``/``detect`` or
        ``cmd`` is not a known command
    '''
    # Membership in a tuple, not ``cmd in 'parse'``, which is a substring test.
    if cmd in ('parse', 'detect') and not urlOrPaths:
        log.error('No URLs/paths specified.')
        raise TikaException('No URLs/paths specified.')
    # str() lets callers pass the port as an integer as well as a string.
    serverEndpoint = 'http://' + serverHost + ':' + str(port)
    if cmd == 'parse':
        return parseAndSave(option, urlOrPaths, outDir, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "detect":
        return detectType(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "language":
        return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "translate":
        return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "config":
        status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
        return resp
    else:
        # log.error rather than log.exception: no exception is being handled here.
        log.error('Bad args')
        raise TikaException('Bad args')
def getPaths(urlOrPaths):
    '''
    Expand the given URLs/paths into a flat list. A directory is walked
    recursively and every file found in it is added; anything else (a file
    path or a URL) is added unchanged.
    :param urlOrPaths: a single url/path (text or byte string) or an iterable of them
    :return: ``list`` of paths
    '''
    # Wrap a single path so its characters are not iterated one by one; byte
    # strings are wrapped too, otherwise their elements would be walked.
    if isinstance(urlOrPaths, (unicode_string, binary_string)):
        urlOrPaths = [urlOrPaths]
    paths = []
    for candidate in urlOrPaths:
        if os.path.isdir(candidate):
            for root, _directories, filenames in walk(candidate):
                paths.extend(os.path.join(root, filename) for filename in filenames)
        else:
            paths.append(candidate)
    return paths
def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                 responseMimeType='application/json', metaExtension='_meta.json',
                 services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}):
    '''
    Parse each object and write the server response to a file named after the
    input with ``metaExtension`` appended.
    :param option: key into ``services`` selecting what to extract
    :param urlOrPaths: url/path or iterable of them, expanded with ``getPaths``
    :param outDir: directory for the output files; ``None`` writes next to each input
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``parse1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param metaExtension: suffix appended to each output file name
    :param services: mapping of option name to server service path
    :return: ``list`` of the output file paths that were written
    '''
    written = []
    for source in getPaths(urlOrPaths):
        if outDir is not None:
            target = os.path.join(outDir, os.path.split(source)[1] + metaExtension)
        else:
            target = source + metaExtension
        log.info('Writing %s' % target)
        with open(target, 'w', encoding='utf-8') as sink:
            result = parse1(option, source, serverEndpoint, verbose, tikaServerJar,
                            responseMimeType, services)
            sink.write(result[1] + u"\n")
        written.append(target)
    return written
def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
          responseMimeType='application/json',
          services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
    '''
    Parse the objects and return extracted metadata and/or text in JSON format.
    :param option: key into ``services`` selecting what to extract
    :param urlOrPaths: iterable of urls/paths; each element is passed to ``parse1``
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``parse1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :param rawResponse: forwarded to ``parse1``; previously accepted but ignored
    :return: ``list`` with one ``parse1`` result per element of ``urlOrPaths``
    '''
    return [parse1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services,
                   rawResponse=rawResponse)
            for path in urlOrPaths]
def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
           responseMimeType='application/json',
           services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None, requestOptions={}):
    '''
    Parse the object and return extracted metadata and/or text in JSON format.
    :param option: key into ``services``; an unknown key falls back to ``'all'``
    :param urlOrPath: url or path of the object; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: ``Accept`` header value; forced to ``text/plain`` for the ``/tika`` service
    :param services: mapping of option name to server service path
    :param rawResponse: passed to ``callServer``
    :param headers: extra request headers; the caller's dict is not modified
    :param config_path: passed to ``callServer``
    :param requestOptions: passed to ``callServer``
    :return: tuple ``(status, response)`` as returned by ``callServer``
    '''
    # Work on a copy so the caller's dict is never mutated.
    headers = dict(headers) if headers else {}
    path, file_type = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        if option not in services:
            log.warning('config option must be one of meta, text, or all; using all.')
        service = services.get(option, services['all'])
        if service == '/tika':
            responseMimeType = 'text/plain'
        dispositionName = path.encode('utf-8') if isinstance(path, unicode_string) else path
        headers.update({'Accept': responseMimeType,
                        'Content-Disposition': make_content_disposition_header(dispositionName)})
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                                          headers, verbose, tikaServerJar, config_path=config_path,
                                          rawResponse=rawResponse, requestOptions=requestOptions)
    finally:
        # Remove the downloaded copy even when the server call fails.
        if file_type == 'remote' and os.path.isfile(path):
            os.unlink(path)
    return (status, response)
def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'file' : '/language/stream'}):
    '''
    Detect the language of every provided stream.
    :param option: key into ``services``
    :param urlOrPaths: url/path or iterable of them, expanded with ``getPaths``
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``detectLang1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :return: ``list`` with one ``detectLang1`` result per path
    '''
    results = []
    for path in getPaths(urlOrPaths):
        results.append(detectLang1(option, path, serverEndpoint, verbose, tikaServerJar,
                                   responseMimeType, services))
    return results
def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'file' : '/language/stream'}, requestOptions={}):
    '''
    Detect the language of the provided stream and return its 2 character code as text/plain.
    :param option: key into ``services``
    :param urlOrPath: url or path of the object; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :param requestOptions: passed to ``callServer``
    :return: tuple ``(status, response)`` as returned by ``callServer``
    :raises TikaException: if ``option`` is not a key of ``services``
    '''
    # Validate before fetching anything. list(services) also formats on
    # Python 3, where bytes(dict_keys) raised TypeError instead.
    if option not in services:
        message = 'Language option must be one of %s ' % list(services)
        log.error(message)
        raise TikaException(message)
    service = services[option]
    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # The context manager closes the handle that used to leak.
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                                          {'Accept': responseMimeType}, verbose, tikaServerJar,
                                          requestOptions=requestOptions)
    finally:
        # Same cleanup as parse1: drop the downloaded copy of a remote file.
        if mode == 'remote' and os.path.isfile(path):
            os.unlink(path)
    return (status, response)
def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'all': '/translate/all'}):
    '''
    Translate every given file from source language to destination language.
    :param option: ``srcLang:destLang`` or just ``destLang``
    :param urlOrPaths: url/path or iterable of them, expanded with ``getPaths``
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``doTranslate1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :return: ``list`` with one ``doTranslate1`` result per path
    '''
    translated = []
    for path in getPaths(urlOrPaths):
        translated.append(doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar,
                                       responseMimeType, services))
    return translated
def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                 responseMimeType='text/plain',
                 services={'all': '/translate/all'}, requestOptions={}):
    '''
    Translate one file from source language to destination language.
    :param option: ``srcLang:destLang``, or ``destLang`` alone to omit the source language
    :param urlOrPath: url or path of the object; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping holding the translate service path under ``'all'``
    :param requestOptions: passed to ``callServer``
    :return: tuple ``(status, response)`` as returned by ``callServer``
    :raises TikaException: if ``option`` contains more than one ``:``
    '''
    # Parse and validate the option before downloading anything.
    srcLang = ""
    destLang = option
    if ":" in option:
        options = option.split(':')
        if len(options) != 2:
            log.error('Translate options are specified as srcLang:destLang or as destLang')
            raise TikaException('Translate options are specified as srcLang:destLang or as destLang')
        srcLang, destLang = options

    if srcLang != "" and destLang != "":
        service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
    else:
        service = services["all"] + "/" + Translator + "/" + destLang

    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # The context manager closes the handle that used to leak.
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                                          {'Accept' : responseMimeType},
                                          verbose, tikaServerJar, requestOptions=requestOptions)
    finally:
        # Same cleanup as parse1: drop the downloaded copy of a remote file.
        if mode == 'remote' and os.path.isfile(path):
            os.unlink(path)
    return (status, response)
def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'type': '/detect/stream'}):
    '''
    Detect the MIME/media type of every provided stream.
    :param option: key into ``services``
    :param urlOrPaths: url/path or iterable of them, expanded with ``getPaths``
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``detectType1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :return: ``list`` with one ``detectType1`` result per path
    '''
    detected = []
    for path in getPaths(urlOrPaths):
        detected.append(detectType1(option, path, serverEndpoint, verbose, tikaServerJar,
                                    responseMimeType, services))
    return detected
def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'type': '/detect/stream'}, config_path=None, requestOptions={}):
    '''
    Detect the MIME/media type of the stream and return it in text/plain.
    :param option: key into ``services``
    :param urlOrPath: url or path of the object; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :param config_path: passed to ``callServer``
    :param requestOptions: passed to ``callServer``
    :return: tuple ``(status, response)``; when the module flag ``csvOutput``
        is 1 the response is prefixed with ``"<urlOrPath>,"``
    :raises TikaException: if ``option`` is not a key of ``services``
    '''
    # Validate before fetching anything. list(services) also formats on
    # Python 3, where bytes(dict_keys) raised TypeError instead.
    if option not in services:
        message = 'Detect option must be one of %s' % list(services)
        log.error(message)
        raise TikaException(message)
    service = services[option]
    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # The context manager closes the handle that used to leak.
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                                          {
                                              'Accept': responseMimeType,
                                              'Content-Disposition': make_content_disposition_header(path)
                                          },
                                          verbose, tikaServerJar, config_path=config_path,
                                          requestOptions=requestOptions)
    finally:
        # Same cleanup as parse1: drop the downloaded copy of a remote file.
        if mode == 'remote' and os.path.isfile(path):
            os.unlink(path)
    if csvOutput == 1:
        # Only byte strings have (and need) .decode(); text passes through.
        name = urlOrPath.decode("UTF-8") if isinstance(urlOrPath, binary_string) else urlOrPath
        return (status, name + "," + response)
    return (status, response)
def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json',
              services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}, requestOptions={}):
    '''
    Query the Tika Server for its configuration (mime-types, detectors or parsers).
    :param option: key into ``services``; an unknown key ends the program via ``die``
    :param serverEndpoint: Tika server endpoint
    :param verbose: verbosity flag passed to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value for the ``Accept`` header
    :param services: mapping of option name to server service path
    :param requestOptions: passed to ``callServer``
    :return: tuple ``(status, response)`` as returned by ``callServer``
    '''
    if option not in services:
        die('config option must be one of mime-types, detectors, or parsers')
    acceptHeader = {'Accept': responseMimeType}
    statusCode, body = callServer('get', serverEndpoint, services[option], None, acceptHeader,
                                  verbose, tikaServerJar, requestOptions=requestOptions)
    return (statusCode, body)
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
               httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
               rawResponse=False, config_path=None, requestOptions={}):
    '''
    Call the Tika Server, do some error checking, and return the response.
    :param verb: key into ``httpVerbs`` (``'get'``, ``'put'`` or ``'post'`` by default)
    :param serverEndpoint: Tika server endpoint URL
    :param service: service path appended to the endpoint
    :param data: request payload; text is encoded as UTF-8, and on Windows a
        file-like object is read fully before sending
    :param headers: request headers
    :param verbose: when true, request and response headers are written to stderr
    :param tikaServerJar: URL or path of the Tika server jar
    :param httpVerbs: mapping of verb name to request function
    :param classpath: JVM class path; ``None`` uses ``TikaServerClasspath``
    :param rawResponse: return the response body as bytes instead of text
    :param config_path: passed to ``checkTikaServer``
    :param requestOptions: keyword arguments for the request function; they
        override the defaults (``timeout``, ``headers``, ``verify``)
    :return: tuple ``(status_code, body)``
    :raises TikaException: if ``verb`` is not a key of ``httpVerbs``
    '''
    # Reject an unknown verb before a local server is possibly started.
    # list(httpVerbs) also formats on Python 3, where bytes(dict_keys) raised TypeError.
    if verb not in httpVerbs:
        message = 'Tika Server call must be one of %s' % list(httpVerbs)
        log.error(message)
        raise TikaException(message)
    verbFn = httpVerbs[verb]

    parsedUrl = urlparse(serverEndpoint)
    serverHost = parsedUrl.hostname
    scheme = parsedUrl.scheme
    port = parsedUrl.port
    if classpath is None:
        classpath = TikaServerClasspath

    if not TikaClientOnly:
        serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    serviceUrl = serverEndpoint + service

    if Windows and hasattr(data, "read"):
        data = data.read()
    encodedData = data.encode('utf-8') if isinstance(data, unicode_string) else data

    # NOTE(security): 'verify': False disables TLS certificate verification by
    # default; callers can re-enable it through requestOptions={'verify': True}.
    effectiveRequestOptions = {
        'timeout': 60,
        'headers': headers,
        'verify': False
    }
    effectiveRequestOptions.update(requestOptions or {})

    resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)

    if verbose:
        # Written to stderr; print(sys.stderr, ...) printed the stream object to stdout.
        sys.stderr.write("Request headers: %s\n" % (headers,))
        sys.stderr.write("Response headers: %s\n" % (resp.headers,))
    if resp.status_code != 200:
        log.warning('Tika server returned status: %d', resp.status_code)

    resp.encoding = "utf-8"
    if rawResponse:
        return (resp.status_code, resp.content)
    return (resp.status_code, resp.text)
def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None, config_path=None):
    '''
    Make sure a tika-server is reachable. For a localhost endpoint whose port
    is not open, fetch the JAR if needed, check its signature and start it.
    :param scheme: e.g. http or https
    :param serverHost: host name of the server
    :param port: server port; ``None`` selects 443 for https and 80 otherwise
    :param tikaServerJar: URL or path of the Tika server jar
    :param classpath: JVM class path; ``None`` uses ``TikaServerClasspath``
    :param config_path: passed to ``startServer``
    :return: the server endpoint as ``scheme://host:port``
    :raises RuntimeError: if ``startServer`` reports that startup failed
    '''
    classpath = TikaServerClasspath if classpath is None else classpath
    if port is None:
        port = '443' if scheme == 'https' else '80'

    jarUrl = urlparse(tikaServerJar)
    serverEndpoint = '%s://%s:%s' % (scheme, serverHost, port)
    jarPath = os.path.join(TikaJarPath, 'tika-server.jar')

    # Only a local endpoint is ever started from here.
    targetsLocalHost = 'localhost' in serverEndpoint or '127.0.0.1' in serverEndpoint
    if not targetsLocalHost:
        return serverEndpoint
    if checkPortIsOpen(serverHost, port):
        return serverEndpoint

    if jarUrl.scheme != '' and not os.path.isfile(jarPath):
        getRemoteJar(tikaServerJar, jarPath)

    # A jar failing the signature check is discarded and fetched again.
    if not checkJarSig(tikaServerJar, jarPath):
        os.remove(jarPath)
        getRemoteJar(tikaServerJar, jarPath)

    started = startServer(jarPath, TikaJava, TikaJavaArgs, serverHost, port, classpath, config_path)
    if not started:
        log.error("Failed to receive startup confirmation from startServer.")
        raise RuntimeError("Unable to start Tika server.")
    return serverEndpoint
def checkJarSig(tikaServerJar, jarPath):
    '''
    Check the MD5 signature of the jar against ``jarPath + ".md5"``.
    :param tikaServerJar: URL or path of the jar; ``".md5"`` is appended to it
        to fetch the checksum file when that file is not present locally
    :param jarPath: local path of the jar to verify
    :return: ``True`` if the signature of the jar matches, ``False`` otherwise
        (including when the checksum file is empty)
    '''
    md5Path = jarPath + ".md5"
    if not os.path.isfile(md5Path):
        getRemoteJar(tikaServerJar + ".md5", md5Path)

    # Hash in chunks so the whole jar is never held in memory at once.
    digest = hashlib.md5()
    with open(jarPath, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)

    with open(md5Path, "r") as em:
        tokens = em.read().split()
    if not tokens:
        return False
    # Compare the first token only, case-insensitively: this tolerates a
    # trailing newline, upper-case hex and the "<hash>  <filename>" layout.
    return tokens[0].lower() == digest.hexdigest()
def startServer(tikaServerJar, java_path = TikaJava, java_args = TikaJavaArgs, serverHost = ServerHost, port = Port, classpath=None, config_path=None):
    '''
    Starts Tika Server
    :param tikaServerJar: path to tika server jar
    :param java_path: java executable used to launch the server
    :param java_args: extra arguments placed right after the java executable
    :param serverHost: accepted for compatibility; the bind host is chosen from
        the platform (``0.0.0.0`` on Windows, ``localhost`` otherwise)
    :param port: the host port to be used for binding the service
    :param classpath: Class path value to pass to JVM; ``None`` uses ``TikaServerClasspath``
    :param config_path: optional Tika configuration file passed with ``--config``
    :return: ``True`` once the startup message is seen in the server log,
        ``False`` if the log file cannot be created, java cannot be run, or the
        message does not appear within ``TikaStartupMaxRetry`` checks
    '''
    if classpath is None:
        classpath = TikaServerClasspath

    host = "0.0.0.0" if Windows else "localhost"

    # os.pathsep is the platform's class path separator (';' on Windows, ':' elsewhere).
    if classpath:
        classpath += os.pathsep + tikaServerJar
    else:
        classpath = tikaServerJar

    # NOTE(security): the command is run through the shell and its parts are
    # not quoted; java_path, java_args, classpath and config_path must be trusted.
    cmd_string = '%s %s -cp %s org.apache.tika.server.TikaServerCli --port %s --host %s' \
                 % (java_path, java_args, classpath, port, host)
    if config_path:
        cmd_string += ' --config %s' % config_path
    cmd_string += ' &'

    # Check that we can write to log path
    tika_log_file_path = os.path.join(TikaServerLogFilePath, 'tika-server.log')
    try:
        logFile = open(tika_log_file_path, 'w')
    except PermissionError:
        log.error("Unable to create tika-server.log at %s due to permission error." % (TikaServerLogFilePath))
        return False

    try:
        # Check that specified java binary is available on path
        try:
            with open(os.devnull, "w") as devnull:
                Popen(java_path, stdout=devnull, stderr=devnull)
        except FileNotFoundError:
            log.error("Unable to run java; is it installed?")
            return False

        # Run java with jar args
        Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
    finally:
        # The child process holds its own copy of the handle; release ours.
        logFile.close()

    # Poll the server log, returning as soon as the startup message appears
    # instead of always sleeping through every retry.
    for _attempt in range(TikaStartupMaxRetry):
        with open(tika_log_file_path, "r") as tika_log_file_tmp:
            # check for INFO string to confirm listening endpoint
            if "Started Apache Tika server at" in tika_log_file_tmp.read():
                return True
        log.warning("Failed to see startup log message; retrying...")
        time.sleep(TikaStartupSleep)

    log.error("Tika startup log message not received after %d tries." % (TikaStartupMaxRetry))
    return False
def toFilename(url):
    '''
    Derive a file name from the path component of ``url``.
    Characters other than word characters, whitespace, ``.`` and ``-`` become
    ``-``; the result is lower-cased, runs of ``-``/whitespace collapse to one
    ``-``, and only the last 200 characters are kept. A URL without a path
    yields ``file_<unix time>``.
    :param url: the URL to convert
    :return: the derived file name
    '''
    raw_path = urlparse(url).path or "file_{}".format(int(time.time()))
    sanitized = re.sub(r'[^\w\s\.\-]', '-', raw_path)
    sanitized = sanitized.strip().lower()
    collapsed = re.sub(r'[-\s]+', '-', sanitized)
    return collapsed.strip("-")[-200:]
def getRemoteFile(urlOrPath, destPath):
    '''
    Fetch an http(s) URL into ``destPath`` or hand back a local path.
    :param urlOrPath: resource locator, generally URL or path
    :param destPath: directory in which a downloaded resource is stored
    :return: tuple ``(path, 'local')`` for a path or a non-http(s) locator,
        ``(downloaded path, 'remote')`` for an http(s) URL
    '''
    scheme = urlparse(urlOrPath).scheme
    if scheme == '':
        return (os.path.abspath(urlOrPath), 'local')
    if scheme not in ('http', 'https'):
        return (urlOrPath, 'local')

    target = destPath + '/' + toFilename(urlOrPath)
    log.info('Retrieving %s to %s.' % (urlOrPath, target))
    try:
        urlretrieve(urlOrPath, target)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        # NOTE(security): this switches the default HTTPS context to an
        # unverified one for the rest of the process.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # discard any partial download before retrying
        if os.path.isfile(target):
            os.remove(target)
        urlretrieve(urlOrPath, target)
    return (target, 'remote')
def getRemoteJar(urlOrPath, destPath):
    '''
    Fetch a URL to ``destPath`` or hand back the absolute form of a local path.
    :param urlOrPath: remote resource locator, or a local path (no URL scheme)
    :param destPath: file path at which a downloaded resource is stored
    :return: tuple ``(absolute path, 'local')`` when ``urlOrPath`` has no
        scheme, otherwise ``(destPath, 'remote')``
    '''
    if urlparse(urlOrPath).scheme == '':
        return (os.path.abspath(urlOrPath), 'local')

    log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
    try:
        urlretrieve(urlOrPath, destPath)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        # NOTE(security): this switches the default HTTPS context to an
        # unverified one for the rest of the process.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # discard any partial download before retrying
        if os.path.isfile(destPath):
            os.remove(destPath)
        urlretrieve(urlOrPath, destPath)
    return (destPath, 'remote')
def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):
    '''
    Checks if the specified port is open
    :param remoteServerHost: the host address
    :param port: port which needs to be checked
    :return: ``True`` if port is open, ``False`` otherwise
    :raises SystemExit: on Ctrl+C or on a socket error raised inside the check
    '''
    # Resolution happens outside the try block, so a resolution failure
    # propagates to the caller as before.
    remoteServerIP = socket.gethostbyname(remoteServerHost)
    sock = None
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # connect_ex returns 0 on success and an error code otherwise.
        return sock.connect_ex((remoteServerIP, int(port))) == 0
    except KeyboardInterrupt:
        print("You pressed Ctrl+C")
        sys.exit()
    except socket.gaierror:
        print('Hostname could not be resolved. Exiting')
        sys.exit()
    except socket.error:
        print("Couldn't connect to server")
        sys.exit()
    finally:
        # sock stays None if socket creation itself failed; closing it
        # unconditionally raised NameError and hid the real error.
        if sock is not None:
            sock.close()
def main(argv=None):
    '''
    Run Tika from command line according to USAGE.
    :param argv: full argument vector including the program name; defaults to ``sys.argv``
    :return: the result of ``runCommand`` for the parsed command
    :raises TikaException: on too few arguments or an unrecognised option
    '''
    global Verbose
    global EncodeUtf8
    global csvOutput
    if argv is None:
        argv = sys.argv

    if len(argv) < 3 and not ('-h' in argv or '--help' in argv):
        log.error('Bad args')
        raise TikaException('Bad args')

    try:
        # -v, -e and -c are plain flags: no trailing ':' in the short-option
        # spec, otherwise getopt swallows the next argument as their value.
        opts, args = getopt.getopt(argv[1:], 'hi:s:o:p:vec',
                                   ['help', 'install=', 'server=', 'output=', 'port=', 'verbose', 'encode', 'csv'])
    except getopt.GetoptError as opt_error:
        # GetoptError exposes .msg and .opt; the exception object itself
        # cannot be unpacked on Python 3.
        message = "%s error: Bad option: %s, %s" % (argv[0], opt_error.opt, opt_error.msg)
        log.exception(message)
        raise TikaException(message)

    tikaServerJar = TikaServerJar
    serverHost = ServerHost
    outDir = '.'
    port = Port
    for opt, val in opts:
        if opt in ('-h', '--help'):
            echo2(USAGE)
            sys.exit()
        elif opt in ('-i', '--install'):
            tikaServerJar = val
        elif opt in ('-s', '--server'):
            serverHost = val
        elif opt in ('-o', '--output'):
            outDir = val
        elif opt in ('-p', '--port'):
            port = val
        elif opt in ('-v', '--verbose'):
            Verbose = 1
        elif opt in ('-e', '--encode'):
            EncodeUtf8 = 1
        elif opt in ('-c', '--csv'):
            csvOutput = 1
        else:
            raise TikaException(USAGE)

    # A command and its option are both required once the switches are removed.
    if len(args) < 2:
        log.error('Bad args')
        raise TikaException('Bad args')
    cmd = args[0]
    option = args[1]
    paths = args[2:]
    return runCommand(cmd, option, paths, port, outDir, serverHost=serverHost, tikaServerJar=tikaServerJar, verbose=Verbose, encode=EncodeUtf8)
if __name__ == '__main__':
    log.info("Logging on '%s'" % (log_file))
    resp = main(sys.argv)

    # Wrap stdout so that text is always emitted as UTF-8.
    if sys.version_info[0] < 3:
        # Python 2.x
        out = codecs.getwriter("UTF-8")(sys.stdout)
    else:
        # Python 3.x and later: the writer needs the underlying byte stream
        out = codecs.getwriter("UTF-8")(sys.stdout.buffer)

    if isinstance(resp, list):
        # 'parse' returns a list of output file paths, while the detect,
        # language and translate commands return (status, text) tuples.
        # Indexing a path with [1] would print only its second character.
        lines = [r[1] if isinstance(r, tuple) else r for r in resp]
        out.write('\n'.join(lines))
    else:
        out.write(resp)
    out.write('\n')