58 lines
1.9 KiB
Python
58 lines
1.9 KiB
Python
|
import zipfile
|
||
|
import xml.etree.ElementTree as ET
|
||
|
|
||
|
from .utils import BaseParser
|
||
|
|
||
|
|
||
|
class Parser(BaseParser):
    """Extract text from open document files.

    The document body is read from the ``content.xml`` member of the zip
    archive and flattened to plain text: every ``text:p`` / ``text:h``
    element found in the tree contributes one line of output.
    """

    # Prefix -> namespace URI table used by ``qn``. Kept at class level so
    # it is built once rather than on every ``qn`` call.
    _NSMAP = {
        'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
    }

    def extract(self, filename, **kwargs):
        """Parse ``content.xml`` out of *filename* and return its text.

        The parsed XML root is stored on ``self.content`` before the text
        is produced by ``to_string``. ``**kwargs`` is accepted but unused
        here.
        """
        # Inspiration from
        # https://github.com/odoo/odoo/blob/master/addons/document/odt2txt.py
        with open(filename, 'rb') as stream:
            # Manage the archive with ``with`` as well, so the ZipFile
            # object is closed explicitly rather than left to the GC.
            with zipfile.ZipFile(stream) as archive:
                self.content = ET.fromstring(archive.read("content.xml"))
        return self.to_string()

    def to_string(self):
        """Convert the parsed document (``self.content``) to a string.

        Each ``text:p`` and ``text:h`` element, at any depth, yields one
        line; lines are separated by a newline with no trailing newline.
        An empty string is returned when no such element exists.
        """
        # Resolve the qualified tag names once instead of per element.
        block_tags = (self.qn('text:p'), self.qn('text:h'))
        lines = [
            self.text_to_string(node)
            for node in self.content.iter()
            if node.tag in block_tags
        ]
        # Joining with "\n" is equivalent to appending "\n" after every
        # line and then dropping the final one.
        return u"\n".join(lines)

    def text_to_string(self, element):
        """Return the text held by *element* and its descendants.

        ``text:tab`` children become a tab character and ``text:s``
        children become spaces (one, plus ``text:c`` - 1 more when the
        ``text:c`` attribute is present). Any other child is processed
        recursively. The element's own tail text is appended at the end.
        """
        tab_tag = self.qn('text:tab')
        space_tag = self.qn('text:s')
        count_attr = self.qn('text:c')

        # Collect fragments and join once; avoids repeated string
        # concatenation inside the loop and across recursion levels.
        parts = []
        if element.text is not None:
            parts.append(element.text)
        for child in element:
            if child.tag == tab_tag:
                parts.append(u"\t")
                if child.tail is not None:
                    parts.append(child.tail)
            elif child.tag == space_tag:
                parts.append(u" ")
                count = child.get(count_attr)
                if count is not None:
                    # One space was already emitted above.
                    parts.append(u" " * (int(count) - 1))
                if child.tail is not None:
                    parts.append(child.tail)
            else:
                # The recursive call appends the child's tail itself.
                parts.append(self.text_to_string(child))
        if element.tail is not None:
            parts.append(element.tail)
        return u"".join(parts)

    def qn(self, namespace):
        """Connect tag prefix to longer namespace.

        *namespace* is a ``prefix:name`` string such as ``'text:p'``; the
        result is the ``{uri}name`` form used by ElementTree. Raises
        ``KeyError`` for a prefix missing from the namespace table and
        ``IndexError`` when the argument contains no ``:``.
        """
        pieces = namespace.split(':')
        return '{{{}}}{}'.format(self._NSMAP[pieces[0]], pieces[1])
|