# cython: language_level=2 """A cleanup tool for HTML. Removes unwanted tags and content. See the `Cleaner` class for details. """ from __future__ import absolute_import import re import copy try: from urlparse import urlsplit from urllib import unquote_plus except ImportError: # Python 3 from urllib.parse import urlsplit, unquote_plus from lxml import etree from lxml.html import defs from lxml.html import fromstring, XHTML_NAMESPACE from lxml.html import xhtml_to_html, _transform_result try: unichr except NameError: # Python 3 unichr = chr try: unicode except NameError: # Python 3 unicode = str try: basestring except NameError: basestring = (str, bytes) __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 'word_break', 'word_break_html'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl # Particularly the CSS cleaning; most of the tag cleaning is integrated now # I have multiple kinds of schemes searched; but should schemes be # whitelisted instead? # max height? # remove images? Also in CSS? background attribute? # Some way to whitelist object, iframe, etc (e.g., if you want to # allow *just* embedded YouTube movies) # Log what was deleted and why? # style="behavior: ..." might be bad in IE? # Should we have something for just ? That's the worst of the # metas. # UTF-7 detections? Example: # +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- # you don't always have to have the charset set, if the page has no charset # and there's UTF7-like code in it. # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _css_javascript_re = re.compile( r'expression\s*\(.*?\)', re.S|re.I) # Do I have to worry about @\nimport? _css_import_re = re.compile( r'@\s*import', re.I) # All kinds of schemes besides just javascript: that can cause # execution: _is_image_dataurl = re.compile( r'^data:image/.+;base64', re.I).search _is_possibly_malicious_scheme = re.compile( r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I).search def _is_javascript_scheme(s): if _is_image_dataurl(s): return None return _is_possibly_malicious_scheme(s) _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub # FIXME: should data: be blocked? # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx _conditional_comment_re = re.compile( r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) _find_styled_elements = etree.XPath( "descendant-or-self::*[@style]") _find_external_links = etree.XPath( ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), namespaces={'x':XHTML_NAMESPACE}) class Cleaner(object): """ Instances cleans the document of each of the possible offending elements. The cleaning is controlled by attributes; you can override attributes in a subclass, or set them in the constructor. ``scripts``: Removes any ``