# Copyright (c) 2004 Ian Bicking. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # # 3. Neither the name of Ian Bicking nor the names of its contributors may # be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """The ``lxml.html`` tool set for HTML handling. 
""" from __future__ import absolute_import __all__ = [ 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute', 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] import copy import sys import re from functools import partial try: from collections.abc import MutableMapping, MutableSet except ImportError: from collections import MutableMapping, MutableSet from .. import etree from . import defs from ._setmixin import SetMixin try: from urlparse import urljoin except ImportError: # Python 3 from urllib.parse import urljoin try: unicode except NameError: # Python 3 unicode = str try: basestring except NameError: # Python 3 basestring = (str, bytes) def __fix_docstring(s): if not s: return s if sys.version_info[0] >= 3: sub = re.compile(r"^(\s*)u'", re.M).sub else: sub = re.compile(r"^(\s*)b'", re.M).sub return sub(r"\1'", s) XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", namespaces={'x':XHTML_NAMESPACE}) _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", namespaces={'x':XHTML_NAMESPACE}) _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", namespaces={'x':XHTML_NAMESPACE}) #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") _collect_string_content = etree.XPath("string()") _iter_css_urls = re.compile(r'url$('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)$', re.I).finditer _iter_css_imports = re.compile(r'@import "(.*?)"').finditer 
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x': XHTML_NAMESPACE})
# Matches one run of non-space characters.
_archive_re = re.compile(r'[^ ]+')
# Skips everything up to and including the first ';', then an optional
# ``url=`` prefix, and captures the rest of the string in the named group
# ``url``.
# NOTE(review): the group name was lost in this copy of the file (the
# pattern read ``(?P.*)``, which does not compile); ``url`` is restored
# from the pattern's own ``url=`` prefix -- confirm against the callers.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search


def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    If ``s`` starts and ends with ``"`` or starts and ends with ``'``,
    returns ``(s without its first and last character, pos + 1)``;
    otherwise returns ``(s, pos)`` unchanged.
    """
    if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
        return s[1:-1], pos+1
    else:
        return s, pos


def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is a class: for a ``bytes`` subclass the result is serialised
    with ``tostring(result, encoding='utf-8')``, for a ``unicode`` subclass
    with ``tostring(result, encoding='unicode')``; for any other class the
    result is returned as is.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    elif issubclass(typ, unicode):
        return tostring(result, encoding='unicode')
    else:
        return result


def _nons(tag):
    """Return ``tag`` without a leading ``{XHTML_NAMESPACE}`` qualifier.

    Tags that are not strings, or that are not in the XHTML namespace,
    are returned unchanged.
    """
    if isinstance(tag, basestring):
        if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
            return tag.split('}')[-1]
    return tag


class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # ``attributes`` is the mapping that holds the 'class' entry; it is
        # read through ``.get`` and written through item assignment/deletion.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError if the name is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        Raises ValueError if the name is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            # Nothing left: drop the attribute instead of leaving it empty.
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError if the name is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # Cheap substring test first; split only when it can succeed.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Names that are already present are skipped; the attribute is only
        rewritten if at least one name was added.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError if the name is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
            enabled = False
        except ValueError:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # Only reachable after a successful removal, so the attribute
            # is known to exist here.
            del self._attributes['class']
        return enabled


class HtmlMixin(object):

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE(review): this names HtmlElement, not HtmlMixin.  HtmlElement is
        # defined outside this part of the file; which is right depends on its
        # base-class order -- confirm before changing.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The label is looked up by matching its ``for`` attribute against
        this element's ``id``; returns None if this element has no id or
        no such label exists.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element in the parent with its own children.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
from lxml.cssselect import CSSSelector return CSSSelector(expr, translator=translator)(self) ######################################## ## Link functions ######################################## def make_links_absolute(self, base_url=None, resolve_base_href=True, handle_failures=None): """ Make all links in the document absolute, given the ``base_url`` for the document (the full URL where the document came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. If ``resolve_base_href`` is true, then any ```` tags in the document are used *and* removed from the document. If it is false then any such tag is ignored. If ``handle_failures`` is None (default), a failure to process a URL will abort the processing. If set to 'ignore', errors are ignored. If set to 'discard', failing URLs will be removed. """ if base_url is None: base_url = self.base_url if base_url is None: raise TypeError( "No base_url given, and the document has no base_url") if resolve_base_href: self.resolve_base_href() if handle_failures == 'ignore': def link_repl(href): try: return urljoin(base_url, href) except ValueError: return href elif handle_failures == 'discard': def link_repl(href): try: return urljoin(base_url, href) except ValueError: return None elif handle_failures is None: def link_repl(href): return urljoin(base_url, href) else: raise ValueError( "unexpected value for handle_failures: %r" % handle_failures) self.rewrite_links(link_repl) def resolve_base_href(self, handle_failures=None): """ Find any ```` tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied. If ``handle_failures`` is None (default), a failure to process a URL will abort the processing. If set to 'ignore', errors are ignored. If set to 'discard', failing URLs will be removed. 
""" base_href = None basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x': XHTML_NAMESPACE}) for b in basetags: base_href = b.get('href') b.drop_tree() if not base_href: return self.make_links_absolute(base_href, resolve_base_href=False, handle_failures=handle_failures) def iterlinks(self): """ Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text). ``pos`` is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags. Note: is *not* taken into account in any way. The link you get is exactly the link in the document. Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on. """ link_attrs = defs.link_attrs for el in self.iter(etree.Element): attribs = el.attrib tag = _nons(el.tag) if tag == 'object': codebase = None ##