# This file is part of EbookLib. # Copyright (c) 2013 Aleksandar Erkalovic # # EbookLib is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # EbookLib is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with EbookLib. If not, see . import six from ebooklib.plugins.base import BasePlugin from ebooklib.utils import parse_html_string # TODO: # - should also look for the _required_ elements # http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element ATTRIBUTES_GLOBAL = ['accesskey', 'class', 'contenteditable', 'contextmenu', 'dir', 'draggable', 'dropzone', 'hidden', 'id', 'inert', 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype', 'lang', 'spellcheck', 'style', 'tabindex', 'title', 'translate', 'epub:type'] # Remove for now from here DEPRECATED_TAGS = ['acronym', 'applet', 'basefont', 'big', 'center', 'dir', 'font', 'frame', 'frameset', 'isindex', 'noframes', 's', 'strike', 'tt'] def leave_only(item, tag_list): for _attr in six.iterkeys(item.attrib): if _attr not in tag_list: del item.attrib[_attr] class SyntaxPlugin(BasePlugin): NAME = 'Check HTML syntax' def html_before_write(self, book, chapter): from lxml import etree try: tree = parse_html_string(chapter.content) except: return root = tree.getroottree() # delete deprecated tags # i should really have a list of allowed tags for tag in DEPRECATED_TAGS: etree.strip_tags(root, tag) head = tree.find('head') if head is not None and len(head) != 0: for _item in head: if _item.tag == 'base': leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target']) elif _item.tag == 'link': leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes']) elif _item.tag == 'title': if _item.text == '': head.remove(_item) elif _item.tag == 'meta': leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'http-equiv', 'content', 'charset']) # just remove for now, but really should not be like this head.remove(_item) elif _item.tag == 'script': leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'charset', 'async', 'defer', 'crossorigin']) elif _item.tag == 'source': leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'media']) elif _item.tag == 'style': leave_only(_item, ATTRIBUTES_GLOBAL + ['media', 'type', 'scoped']) else: leave_only(_item, ATTRIBUTES_GLOBAL) if len(root.find('body')) != 0: body = tree.find('body') for _item in body.iter(): # it is not # if _item.tag == 'a': leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target', 'download', 'rel', 'hreflang', 'type']) elif _item.tag == 'area': leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type']) elif _item.tag == 'audio': leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls']) elif _item.tag == 'blockquote': leave_only(_item, ATTRIBUTES_GLOBAL + ['cite']) elif _item.tag == 'button': leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate', 'formtarget', 'name', 'type', 'value', 'menu']) elif _item.tag == 'canvas': leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height']) elif _item.tag == 'canvas': leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height']) elif _item.tag == 'del': leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime']) elif _item.tag == 'details': leave_only(_item, ATTRIBUTES_GLOBAL + ['open']) elif _item.tag == 'embed': leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'width', 'height']) elif _item.tag == 'fieldset': leave_only(_item, ATTRIBUTES_GLOBAL + ['disable', 'form', 'name']) elif _item.tag == 'details': leave_only(_item, ATTRIBUTES_GLOBAL + ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target']) elif _item.tag == 'iframe': leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height']) elif _item.tag == 'img': _src = _item.get('src', '').lower() if _src.startswith('http://') or _src.startswith('https://'): if 'remote-resources' not in chapter.properties: chapter.properties.append('remote-resources') # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES # THAT MEANS I SHOULD ALSO CATCH