117 lines
4.4 KiB
Python
117 lines
4.4 KiB
Python
|
#!/usr/bin/env python
|
||
|
# encoding: utf-8
|
||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||
|
# contributor license agreements. See the NOTICE file distributed with
|
||
|
# this work for additional information regarding copyright ownership.
|
||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||
|
# (the "License"); you may not use this file except in compliance with
|
||
|
# the License. You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
#
|
||
|
|
||
|
from .tika import parse1, callServer, ServerEndpoint
|
||
|
import tarfile
|
||
|
from io import BytesIO, TextIOWrapper
|
||
|
import csv
|
||
|
from sys import version_info
|
||
|
|
||
|
# Python 3 introduced .readable() to tarfile extracted file objects - this
|
||
|
# is required to wrap a TextIOWrapper around the object. However, wrapping
|
||
|
# with TextIOWrapper is only required for csv.reader() in Python 3, so the
|
||
|
# tarfile returned object can be used as is in earlier versions.
|
||
|
# Pick the wrapper once at import time: Python 3 needs a text layer around the
# binary file object before csv.reader() can consume it, older interpreters
# can use the object unchanged.
if version_info.major >= 3:
    _text_wrapper = TextIOWrapper
else:
    def _text_wrapper(x):
        return x
|
||
|
|
||
|
|
||
|
def from_file(filename, serverEndpoint=ServerEndpoint, requestOptions=None):
    '''
    Unpack a file through the Tika server ``/unpack/all`` service
    :param filename: file to unpack (handed to ``parse1`` unchanged)
    :param serverEndpoint: Tika server end point (optional)
    :param requestOptions: extra options forwarded to ``parse1`` (optional);
        ``None`` is treated as an empty dict
    :return: dictionary with the keys ``content``, ``metadata`` and
        ``attachments``, or an empty dictionary when the server returned no
        response body
    '''
    # A fresh dict per call avoids sharing one mutable default object between
    # every invocation of this function.
    if requestOptions is None:
        requestOptions = {}

    # rawResponse=True keeps the tar archive as raw data so that it can be
    # opened with tarfile in _parse().
    tarOutput = parse1('unpack', filename, serverEndpoint,
                       responseMimeType='application/x-tar',
                       services={'meta': '/meta', 'text': '/tika',
                                 'all': '/rmeta/xml', 'unpack': '/unpack/all'},
                       rawResponse=True, requestOptions=requestOptions)
    return _parse(tarOutput)
|
||
|
|
||
|
|
||
|
def from_buffer(string, serverEndpoint=ServerEndpoint, requestOptions=None):
    '''
    Unpack buffered content through the Tika server ``/unpack/all`` service
    :param string: buffered content, sent as the body of the PUT request
    :param serverEndpoint: Tika server URL (Optional)
    :param requestOptions: extra options forwarded to ``callServer``
        (optional); ``None`` is treated as an empty dict
    :return: dictionary with the keys ``content``, ``metadata`` and
        ``attachments``, or an empty dictionary when the server returned no
        response body
    '''
    # A fresh dict per call avoids sharing one mutable default object between
    # every invocation of this function.
    if requestOptions is None:
        requestOptions = {}

    # Ask for a tar archive and keep the response raw so that it can be opened
    # with tarfile in _parse().
    status, response = callServer('put', serverEndpoint, '/unpack/all', string,
                                  {'Accept': 'application/x-tar'}, False,
                                  rawResponse=True, requestOptions=requestOptions)

    return _parse((status, response))
|
||
|
|
||
|
|
||
|
def _parse(tarOutput):
    '''
    Convert the tar archive produced by the unpack service into a dictionary
    :param tarOutput: ``(status, response)`` pair; the second element holds
        the tar archive as bytes
    :return: empty dictionary when there is no response body, otherwise a
        dictionary with ``content`` (decoded text of the ``__TEXT__`` member),
        ``metadata`` (parsed from the ``__METADATA__`` CSV member) and
        ``attachments`` (every other regular-file member, name -> bytes)
    :raises ValueError: if a metadata row has a key but no value
    '''
    parsed = {}
    if not tarOutput:
        return parsed
    elif tarOutput[1] is None or tarOutput[1] == b"":
        return parsed

    metadata = {}
    content = ""
    attachments = {}

    # the context manager closes the archive even when a member fails to parse
    with tarfile.open(fileobj=BytesIO(tarOutput[1])) as tarFile:
        # get the member names
        memberNames = tarFile.getnames()

        # extract the metadata
        if "__METADATA__" in memberNames:
            memberNames.remove("__METADATA__")

            metadataMember = tarFile.getmember("__METADATA__")
            if not metadataMember.issym() and metadataMember.isfile():
                metadataFile = tarFile.extractfile(metadataMember)
                if version_info.major >= 3:
                    # csv.reader() needs text in Python 3; decode explicitly
                    # as UTF-8 (like __TEXT__ below) instead of relying on
                    # the locale's default encoding
                    metadataFile = TextIOWrapper(metadataFile, encoding='utf8')
                metadata = _read_metadata(metadataFile)

        # get the content
        if "__TEXT__" in memberNames:
            memberNames.remove("__TEXT__")

            contentMember = tarFile.getmember("__TEXT__")
            if not contentMember.issym() and contentMember.isfile():
                contentFile = tarFile.extractfile(contentMember)
                if version_info.major >= 3:
                    content = TextIOWrapper(contentFile, encoding='utf8').read()
                else:
                    content = contentFile.read().decode('utf8')

        # get the remaining files as attachments
        for attachment in memberNames:
            attachmentMember = tarFile.getmember(attachment)
            if not attachmentMember.issym() and attachmentMember.isfile():
                attachments[attachment] = tarFile.extractfile(attachmentMember).read()

    parsed["content"] = content
    parsed["metadata"] = metadata
    parsed["attachments"] = attachments

    return parsed


def _read_metadata(metadataFile):
    '''
    Read the rows of the ``__METADATA__`` CSV member into a dictionary
    :param metadataFile: file-like object yielding the CSV lines
    :return: dictionary mapping each key to a single value, or to a list of
        values when the row carries more than one
    :raises ValueError: if a non-empty row has a key but no value
    '''
    metadata = {}
    for metadataLine in csv.reader(metadataFile):
        if not metadataLine:
            # csv.reader() yields an empty list for a blank line; there is
            # nothing to record for it
            continue

        # explicit check instead of assert, which is stripped under -O
        if len(metadataLine) < 2:
            raise ValueError(
                "malformed metadata row (key without value): %r" % (metadataLine,))

        # each metadata line comes as a key-value pair, with list values
        # returned as extra values in the line - convert single values
        # to non-list values to be consistent with parser metadata
        if len(metadataLine) > 2:
            metadata[metadataLine[0]] = metadataLine[1:]
        else:
            metadata[metadataLine[0]] = metadataLine[1]
    return metadata
|