PCQRSCANER/venv/Lib/site-packages/textract/parsers/__init__.py

"""
Route the request to the appropriate parser based on file type.
"""

import os
import importlib
import glob
import re

from .. import exceptions

# Maps an alternative spelling of a file extension to the extension whose
# parser should handle it. Keys and values are lower-case and carry the
# leading dot; the empty-string key covers file names that have no
# extension at all.
EXTENSION_SYNONYMS = {
    ".jpeg": ".jpg",
    ".tff": ".tiff",
    ".tif": ".tiff",
    ".htm": ".html",
    "": ".txt",
    ".log": ".txt",
}

# default encoding that is returned by the process method. specify it
# here so the default is used on both the process function and also by
# the command line interface
DEFAULT_ENCODING = 'utf_8'

# suffix appended to an extension to form the name of its parser module
# (the module for ``.xyz`` files is ``xyz_parser``)
_FILENAME_SUFFIX = '_parser'


def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):
    """Extract the text of ``filename`` with the parser for its file type.

    The parser module is picked from the file extension. When
    ``extension`` is given it takes precedence over the extension found
    in ``filename`` and may be written with or without its leading
    ``.``. The extension is lower-cased and translated through
    ``EXTENSION_SYNONYMS`` before the parser module is looked up.

    ``encoding`` and any additional keyword arguments are handed to the
    parser's ``process`` method, and its result (the extracted text as a
    byte-string encoded with ``encoding``) is returned.

    Raises ``exceptions.MissingFileError`` if ``filename`` does not exist
    and ``exceptions.ExtensionNotSupported`` if the parser module for
    the extension cannot be imported.
    """
    # nothing to parse if the path is not there
    if not os.path.exists(filename):
        raise exceptions.MissingFileError(filename)

    # an explicitly supplied extension wins over the one embedded in the
    # file name, which is useful for files that have no extension
    if extension:
        suffix = extension if extension.startswith('.') else '.' + extension
    else:
        suffix = os.path.splitext(filename)[1]
    suffix = suffix.lower()

    # collapse synonymous extensions onto the one that owns a parser
    suffix = EXTENSION_SYNONYMS.get(suffix, suffix)

    # parser modules are named <ext>_parser so that they never clash with
    # globally installed packages (e.g. python's json module). The name
    # starts with the extension's dot, which makes this a relative import
    # that needs the package name to be resolved.
    module_name = suffix + _FILENAME_SUFFIX
    try:
        parser_module = importlib.import_module(
            module_name, 'textract.parsers'
        )
    except ImportError:
        # no importable parser module means the extension is unsupported
        raise exceptions.ExtensionNotSupported(suffix)

    # do the extraction
    return parser_module.Parser().process(filename, encoding, **kwargs)


def _get_available_extensions():
    """Get a list of available file extensions to make it easy for
    tab-completion and exception handling.

    Extensions are gathered from two places: the ``*_parser.py`` files
    located next to this module and the non-empty keys of
    ``EXTENSION_SYNONYMS``. Each extension is listed twice, once with
    and once without its leading ``.``.

    Returns the collected extensions as a sorted list of strings.
    """
    extensions = []

    # from filenames. Only the base name of each match is tested against
    # the pattern: compiling the full glob path as a regular expression
    # breaks as soon as the directory contains regex metacharacters, most
    # notably the backslash separators of Windows paths.
    parsers_dir = os.path.dirname(__file__)
    glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py")
    ext_re = re.compile(
        r"(?P<ext>\w+)" + re.escape(_FILENAME_SUFFIX + ".py") + r"$"
    )
    for filename in glob.glob(glob_filename):
        ext_match = ext_re.match(os.path.basename(filename))
        if ext_match is None:
            # the glob is looser than the pattern (it also accepts an
            # empty or non-word prefix); such files name no extension
            continue
        ext = ext_match.group('ext')
        extensions.append(ext)
        extensions.append('.' + ext)

    # from relevant synonyms (don't use the '' synonym)
    for ext in EXTENSION_SYNONYMS:
        if ext:
            extensions.append(ext)
            extensions.append(ext.replace('.', '', 1))
    extensions.sort()
    return extensions
3 2019-12-22 21:51:47 +01:00			`"""`
			`Route the request to the appropriate parser based on file type.`
			`"""`

			`import os`
			`import importlib`
			`import glob`
			`import re`

			`from .. import exceptions`

			`# Dictionary structure for synonymous file extension types`
			`EXTENSION_SYNONYMS = {`
			`".jpeg": ".jpg",`
			`".tff": ".tiff",`
			`".tif": ".tiff",`
			`".htm": ".html",`
			`"": ".txt",`
			`".log": ".txt",`
			`}`

			`# default encoding that is returned by the process method. specify it`
			`# here so the default is used on both the process function and also by`
			`# the command line interface`
			`DEFAULT_ENCODING = 'utf_8'`

			`# filename format`
			`_FILENAME_SUFFIX = '_parser'`


			`def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs):`
			`"""This is the core function used for extracting text. It routes the`
			``filename`` to the appropriate parser and returns the extracted
			text as a byte-string encoded with ``encoding``.
			`"""`

			`# make sure the filename exists`
			`if not os.path.exists(filename):`
			`raise exceptions.MissingFileError(filename)`

			`# get the filename extension, which is something like .docx for`
			`# example, and import the module dynamically using importlib. This`
			`# is a relative import so the name of the package is necessary`
			`# normally, file extension will be extracted from the file name`
			`# if the file name has no extension, then the user can pass the`
			`# extension as an argument`
			`if extension:`
			`ext = extension`
			`# check if the extension has the leading .`
			`if not ext.startswith('.'):`
			`ext = '.' + ext`
			`ext = ext.lower()`
			`else:`
			`_, ext = os.path.splitext(filename)`
			`ext = ext.lower()`

			`# check the EXTENSION_SYNONYMS dictionary`
			`ext = EXTENSION_SYNONYMS.get(ext, ext)`

			`# to avoid conflicts with packages that are installed globally`
			`# (e.g. python's json module), all extension parser modules have`
			`# the _parser extension`
			`rel_module = ext + _FILENAME_SUFFIX`

			`# If we can't import the module, the file extension isn't currently`
			`# supported`
			`try:`
			`filetype_module = importlib.import_module(`
			`rel_module, 'textract.parsers'`
			`)`
			`except ImportError:`
			`raise exceptions.ExtensionNotSupported(ext)`

			`# do the extraction`

			`parser = filetype_module.Parser()`
			`return parser.process(filename, encoding, **kwargs)`


			`def _get_available_extensions():`
			`"""Get a list of available file extensions to make it easy for`
			`tab-completion and exception handling.`
			`"""`
			`extensions = []`

			`# from filenames`
			`parsers_dir = os.path.join(os.path.dirname(__file__))`
			`glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py")`
			`ext_re = re.compile(glob_filename.replace('*', r"(?P<ext>\w+)"))`
			`for filename in glob.glob(glob_filename):`
			`ext_match = ext_re.match(filename)`
			`ext = ext_match.groups()[0]`
			`extensions.append(ext)`
			`extensions.append('.' + ext)`

			`# from relevant synonyms (don't use the '' synonym)`
			`for ext in EXTENSION_SYNONYMS.keys():`
			`if ext:`
			`extensions.append(ext)`
			`extensions.append(ext.replace('.', '', 1))`
			`extensions.sort()`
			`return extensions`