PCQRSCANER/venv/Lib/site-packages/textract/parsers/epub_parser.py
2019-12-22 21:51:47 +01:00

22 lines
622 B
Python

from ebooklib import epub, ITEM_DOCUMENT
from bs4 import BeautifulSoup
from .utils import BaseParser
class Parser(BaseParser):
    """Extract text from epub using python epub library
    """

    def extract(self, filename, **kwargs):
        """Return the text content of the EPUB file at ``filename``.

        The book's spine is walked in order. Each spine item's content is
        parsed with BeautifulSoup (``lxml`` parser) and the text of every
        ``title``, ``p``, ``div`` and ``h1``-``h4`` element is emitted,
        each followed by a newline.

        ``**kwargs`` is accepted for interface compatibility and is not
        used here.

        NOTE(review): ``find_all`` also returns matching elements nested
        inside other matching elements (e.g. a ``p`` inside a ``div``), so
        such text appears more than once in the output. This is kept as-is
        to preserve the existing output format.
        """
        book = epub.read_epub(filename)

        # Collect pieces and join once instead of repeatedly concatenating
        # onto a growing string.
        pieces = []
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            if item is None:
                # The spine references an id that has no manifest item
                # (malformed EPUB); skip it rather than fail on
                # ``None.content``.
                continue
            soup = BeautifulSoup(item.content, 'lxml')
            for child in soup.find_all(
                ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']
            ):
                pieces.append(child.text)

        # Every extracted element is terminated by its own newline.
        return ''.join(piece + '\n' for piece in pieces)