# PCQRSCANER/venv/Lib/site-packages/textract/parsers/odt_parser.py

import zipfile
import xml.etree.ElementTree as ET

from .utils import BaseParser


class Parser(BaseParser):
    """Extract text from OpenDocument text files.

    The document body is read from the ``content.xml`` member of the zip
    container. Every paragraph (``text:p``) and heading (``text:h``) found
    in it becomes one line of the extracted text.
    """

    # Prefix -> namespace URI table used by qn(). Kept at class level so it
    # is built once rather than on every qn() call.
    _NSMAP = {
        'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
    }

    def extract(self, filename, **kwargs):
        """Return the text of the document stored at ``filename``.

        The parsed ``content.xml`` tree is kept on ``self.content``.
        ``kwargs`` is accepted but not used here. Nothing is caught in this
        method, so errors from ``open``, ``zipfile`` or ``ElementTree``
        propagate to the caller.
        """
        # Inspiration from
        # https://github.com/odoo/odoo/blob/master/addons/document/odt2txt.py
        with open(filename, 'rb') as stream:
            # Close the archive handle deterministically instead of leaving
            # it to garbage collection.
            with zipfile.ZipFile(stream) as archive:
                self.content = ET.fromstring(archive.read("content.xml"))
        return self.to_string()

    def to_string(self):
        """Convert the parsed document (``self.content``) to a string.

        Returns the text of every ``text:p`` / ``text:h`` element in document
        order, joined with newlines and without a trailing newline.
        """
        block_tags = (self.qn('text:p'), self.qn('text:h'))
        # Use the tail-less helper: the tail of a paragraph element is text
        # that sits *between* paragraphs (typically indentation whitespace in
        # pretty-printed XML) and is not part of the paragraph itself.
        lines = [
            self._element_text(node)
            for node in self.content.iter()
            if node.tag in block_tags
        ]
        return u"\n".join(lines)

    def text_to_string(self, element):
        """Return the text of ``element`` followed by its tail text.

        The tail is included so that, when this is called for a child
        element, the text that follows the child inside its parent is not
        lost.
        """
        return self._element_text(element) + (element.tail or u"")

    def _element_text(self, element):
        """Return the text contained in ``element``, excluding its own tail."""
        # Resolve the qualified tag/attribute names once per call rather
        # than once per child.
        tab_tag = self.qn('text:tab')
        space_tag = self.qn('text:s')
        break_tag = self.qn('text:line-break')
        count_attr = self.qn('text:c')

        parts = [element.text or u""]
        for child in element:
            if child.tag == tab_tag:
                parts.append(u"\t")
                parts.append(child.tail or u"")
            elif child.tag == break_tag:
                # A line break inside a paragraph; without this branch the
                # text on either side of it was glued together.
                parts.append(u"\n")
                parts.append(child.tail or u"")
            elif child.tag == space_tag:
                # text:s stands for one space, or text:c spaces when that
                # attribute is present; never fewer than one is emitted.
                count = child.get(count_attr)
                repeat = 1 if count is None else max(int(count), 1)
                parts.append(u" " * repeat)
                parts.append(child.tail or u"")
            else:
                # Any other inline element: recurse. text_to_string() also
                # appends the child's tail.
                parts.append(self.text_to_string(child))
        return u"".join(parts)

    def qn(self, namespace):
        """Connect tag prefix to longer namespace.

        ``namespace`` is a ``prefix:name`` string such as ``'text:p'``; the
        result is the ``{uri}name`` form that ElementTree uses for tags and
        attributes. An unknown prefix raises ``KeyError`` and a string
        without a colon raises ``IndexError``.
        """
        spl = namespace.split(':')
        return '{{{}}}{}'.format(self._NSMAP[spl[0]], spl[1])
3 2019-12-22 21:51:47 +01:00			`import zipfile`
			`import xml.etree.ElementTree as ET`

			`from .utils import BaseParser`


			`class Parser(BaseParser):`
			`"""Extract text from open document files.`
			`"""`

			`def extract(self, filename, **kwargs):`
			`# Inspiration from`
			`# https://github.com/odoo/odoo/blob/master/addons/document/odt2txt.py`
			`with open(filename, 'rb') as stream:`
			`zip_stream = zipfile.ZipFile(stream)`
			`self.content = ET.fromstring(zip_stream.read("content.xml"))`
			`return self.to_string()`

			`def to_string(self):`
			`""" Converts the document to a string. """`
			`buff = u""`
			`for child in self.content.iter():`
			`if child.tag in [self.qn('text:p'), self.qn('text:h')]:`
			`buff += self.text_to_string(child) + "\n"`
			`# remove last newline char`
			`if buff:`
			`buff = buff[:-1]`
			`return buff`

			`def text_to_string(self, element):`
			`buff = u""`
			`if element.text is not None:`
			`buff += element.text`
			`for child in element:`
			`if child.tag == self.qn('text:tab'):`
			`buff += "\t"`
			`if child.tail is not None:`
			`buff += child.tail`
			`elif child.tag == self.qn('text:s'):`
			`buff += u" "`
			`if child.get(self.qn('text:c')) is not None:`
			`buff += u" " * (int(child.get(self.qn('text:c'))) - 1)`
			`if child.tail is not None:`
			`buff += child.tail`
			`else:`
			`buff += self.text_to_string(child)`
			`if element.tail is not None:`
			`buff += element.tail`
			`return buff`

			`def qn(self, namespace):`
			`"""Connect tag prefix to longer namespace"""`
			`nsmap = {`
			`'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',`
			`}`
			`spl = namespace.split(':')`
			`return '{{{}}}{}'.format(nsmap[spl[0]], spl[1])`