22 lines
622 B
Python
22 lines
622 B
Python
from ebooklib import epub, ITEM_DOCUMENT
|
|
from bs4 import BeautifulSoup
|
|
|
|
from .utils import BaseParser
|
|
|
|
|
|
class Parser(BaseParser):
    """Extract text from EPUB files using ebooklib and BeautifulSoup."""

    def extract(self, filename, **kwargs):
        """Return the text of the EPUB at ``filename`` as a single string.

        The book is read with ``epub.read_epub`` and its spine is walked in
        order. Each spine item's content is parsed with BeautifulSoup's
        ``lxml`` parser, and the text of every ``title``, ``p``, ``div`` and
        ``h1``-``h4`` element is appended, followed by a newline.

        ``**kwargs`` is accepted but not used by this method.

        NOTE(review): ``find_all`` is given both container tags (``div``) and
        the tags typically nested inside them (``p``, headings), so nested
        text may appear more than once in the result. This matches the
        previous behavior and is deliberately left unchanged.
        """
        book = epub.read_epub(filename)
        wanted_tags = ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']

        # Collect pieces and join once instead of repeated string
        # concatenation, which is quadratic on large books.
        chunks = []
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            if item is None:
                # A spine entry that references a missing manifest item has
                # no content to extract; skip it rather than crash.
                continue
            soup = BeautifulSoup(item.content, 'lxml')
            chunks.extend(
                child.text + '\n' for child in soup.find_all(wanted_tags)
            )
        return ''.join(chunks)
|