PCQRSCANER/venv/Lib/site-packages/textract/parsers/epub_parser.py

from ebooklib import epub, ITEM_DOCUMENT
from bs4 import BeautifulSoup

from .utils import BaseParser


class Parser(BaseParser):
    """Extract text from epub using python epub library
    """

    def extract(self, filename, **kwargs):
        book = epub.read_epub(filename)
        result = ''
        for id, _ in book.spine:
            item = book.get_item_with_id(id)
            soup = BeautifulSoup(item.content, 'lxml')
            for child in soup.find_all(
                ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']
            ):
                result = result + child.text + '\n'
        return result
3 2019-12-22 21:51:47 +01:00			`from ebooklib import epub, ITEM_DOCUMENT`
			`from bs4 import BeautifulSoup`

			`from .utils import BaseParser`


			`class Parser(BaseParser):`
			`"""Extract text from epub using python epub library`
			`"""`

			`def extract(self, filename, **kwargs):`
			`book = epub.read_epub(filename)`
			`result = ''`
			`for id, _ in book.spine:`
			`item = book.get_item_with_id(id)`
			`soup = BeautifulSoup(item.content, 'lxml')`
			`for child in soup.find_all(`
			`['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']`
			`):`
			`result = result + child.text + '\n'`
			`return result`