# Repository viewer metadata (not part of the module):
#   revision: 726d90d871
#   git-svn-id: http://google-refine.googlecode.com/svn/branches/split-refactor@908 7d457c2a-affb-35e4-300a-418c747d4874
#   389 lines, 12 KiB, Python
"""Implementation of the DOM Level 3 'LS-Load' feature."""
|
|
|
|
import copy
|
|
import xml.dom
|
|
|
|
from xml.dom.minicompat import *
|
|
|
|
from xml.dom.NodeFilter import NodeFilter
|
|
|
|
|
|
__all__ = ["DOMBuilder", "DOMEntityResolver", "DOMInputSource"]
|
|
|
|
|
|
class Options:
    """Settings holder with one class-level default per DOMBuilder feature.

    A DOMBuilder keeps an instance of this class and uses it to hand its
    configuration over to the ExpatBuilder class.
    """

    # The DOMBuilder class in LoadSave restricts which of these values
    # may actually be changed through the DOM Level 3 LoadSave feature.

    # --- namespace handling -------------------------------------------
    namespaces = 1
    namespace_declarations = True

    # --- validation and external entities -----------------------------
    validation = False
    external_parameter_entities = True
    external_general_entities = True
    external_dtd_subset = True
    validate_if_schema = False
    validate = False
    datatype_normalization = False

    # --- shape of the tree that gets built ----------------------------
    create_entity_ref_nodes = True
    entities = True
    whitespace_in_element_content = True
    cdata_sections = True
    comments = True
    charset_overrides_xml_encoding = True
    infoset = False
    supported_mediatypes_only = False

    # --- per-parse collaborators (None means "not configured") --------
    errorHandler = None
    filter = None
|
|
|
|
|
|
class DOMBuilder:
    """Configurable front end that parses XML input into a DOM document.

    Feature switches are stored on a private Options instance; parse()
    works on a copy of it, and the actual parsing is delegated to
    xml.dom.expatbuilder.
    """

    entityResolver = None
    errorHandler = None
    filter = None

    ACTION_REPLACE = 1
    ACTION_APPEND_AS_CHILDREN = 2
    ACTION_INSERT_AFTER = 3
    ACTION_INSERT_BEFORE = 4

    _legal_actions = (ACTION_REPLACE, ACTION_APPEND_AS_CHILDREN,
                      ACTION_INSERT_AFTER, ACTION_INSERT_BEFORE)

    def __init__(self):
        self._options = Options()

    def _get_entityResolver(self):
        return self.entityResolver

    def _set_entityResolver(self, entityResolver):
        self.entityResolver = entityResolver

    def _get_errorHandler(self):
        return self.errorHandler

    def _set_errorHandler(self, errorHandler):
        self.errorHandler = errorHandler

    def _get_filter(self):
        return self.filter

    def _set_filter(self, filter):
        self.filter = filter

    def setFeature(self, name, state):
        """Turn the feature *name* on or off.

        *state* is interpreted by truthiness.  Raises xml.dom.NotFoundErr
        when the feature is unknown, and xml.dom.NotSupportedErr when the
        feature is known but cannot be set to the requested state.
        """
        if not self.supportsFeature(name):
            raise xml.dom.NotFoundErr("unknown feature: " + repr(name))
        key = (_name_xform(name), int(bool(state)))
        try:
            settings = self._settings[key]
        except KeyError:
            raise xml.dom.NotSupportedErr(
                "unsupported feature: " + repr(name))
        # One feature may imply several option changes (see _settings).
        for option, value in settings:
            setattr(self._options, option, value)

    def supportsFeature(self, name):
        """Return true if *name* corresponds to an attribute of the options."""
        return hasattr(self._options, _name_xform(name))

    def canSetFeature(self, name, state):
        """Return true if setFeature(name, state) would be accepted."""
        key = (_name_xform(name), int(bool(state)))
        return key in self._settings

    # This dictionary maps from (feature,value) to a list of
    # (option,value) pairs that should be set on the Options object.
    # If a (feature,value) setting is not in this dictionary, it is
    # not supported by the DOMBuilder.
    #
    _settings = {
        ("namespace_declarations", 0): [
            ("namespace_declarations", 0)],
        ("namespace_declarations", 1): [
            ("namespace_declarations", 1)],
        ("validation", 0): [
            ("validation", 0)],
        ("external_general_entities", 0): [
            ("external_general_entities", 0)],
        ("external_general_entities", 1): [
            ("external_general_entities", 1)],
        ("external_parameter_entities", 0): [
            ("external_parameter_entities", 0)],
        ("external_parameter_entities", 1): [
            ("external_parameter_entities", 1)],
        ("validate_if_schema", 0): [
            ("validate_if_schema", 0)],
        ("create_entity_ref_nodes", 0): [
            ("create_entity_ref_nodes", 0)],
        ("create_entity_ref_nodes", 1): [
            ("create_entity_ref_nodes", 1)],
        ("entities", 0): [
            ("create_entity_ref_nodes", 0),
            ("entities", 0)],
        ("entities", 1): [
            ("entities", 1)],
        ("whitespace_in_element_content", 0): [
            ("whitespace_in_element_content", 0)],
        ("whitespace_in_element_content", 1): [
            ("whitespace_in_element_content", 1)],
        ("cdata_sections", 0): [
            ("cdata_sections", 0)],
        ("cdata_sections", 1): [
            ("cdata_sections", 1)],
        ("comments", 0): [
            ("comments", 0)],
        ("comments", 1): [
            ("comments", 1)],
        ("charset_overrides_xml_encoding", 0): [
            ("charset_overrides_xml_encoding", 0)],
        ("charset_overrides_xml_encoding", 1): [
            ("charset_overrides_xml_encoding", 1)],
        ("infoset", 0): [],
        ("infoset", 1): [
            ("namespace_declarations", 0),
            ("validate_if_schema", 0),
            ("create_entity_ref_nodes", 0),
            ("entities", 0),
            ("cdata_sections", 0),
            ("datatype_normalization", 1),
            ("whitespace_in_element_content", 1),
            ("comments", 1),
            ("charset_overrides_xml_encoding", 1)],
        ("supported_mediatypes_only", 0): [
            ("supported_mediatypes_only", 0)],
        ("namespaces", 0): [
            ("namespaces", 0)],
        ("namespaces", 1): [
            ("namespaces", 1)],
    }

    def getFeature(self, name):
        """Return the current value of the feature *name*.

        Raises xml.dom.NotFoundErr if the feature is not known.
        """
        xname = _name_xform(name)
        try:
            return getattr(self._options, xname)
        except AttributeError:
            # Compare the normalized name so that the spelling of the
            # request does not matter, as everywhere else in this class.
            if xname == "infoset":
                options = self._options
                return (options.datatype_normalization
                        and options.whitespace_in_element_content
                        and options.comments
                        and options.charset_overrides_xml_encoding
                        and not (options.namespace_declarations
                                 or options.validate_if_schema
                                 or options.create_entity_ref_nodes
                                 or options.entities
                                 or options.cdata_sections))
            raise xml.dom.NotFoundErr("feature %s not known" % repr(name))

    def parseURI(self, uri):
        """Resolve *uri* to an input source and parse it.

        Uses self.entityResolver when one is set, otherwise a fresh
        DOMEntityResolver.
        """
        if self.entityResolver:
            input = self.entityResolver.resolveEntity(None, uri)
        else:
            input = DOMEntityResolver().resolveEntity(None, uri)
        return self.parse(input)

    def parse(self, input):
        """Parse the input source *input* and return the builder's result.

        The options are copied so that the per-parse filter and error
        handler do not leak back into the builder's own settings.
        """
        options = copy.copy(self._options)
        options.filter = self.filter
        options.errorHandler = self.errorHandler
        fp = input.byteStream
        # The system identifier lives on the input source, not on the
        # options object (which has no such attribute).
        if fp is None and input.systemId:
            import urllib2
            fp = urllib2.urlopen(input.systemId)
        return self._parse_bytestream(fp, options)

    def parseWithContext(self, input, cnode, action):
        """Not implemented; only validates *action*.

        Raises ValueError for an action that is not one of the ACTION_*
        constants, NotImplementedError otherwise.
        """
        if action not in self._legal_actions:
            raise ValueError("not a legal action")
        raise NotImplementedError("Haven't written this yet...")

    def _parse_bytestream(self, stream, options):
        """Build a document from the file-like *stream* using *options*."""
        import xml.dom.expatbuilder
        builder = xml.dom.expatbuilder.makeBuilder(options)
        return builder.parseFile(stream)
|
|
|
|
|
|
def _name_xform(name):
|
|
return name.lower().replace('-', '_')
|
|
|
|
|
|
class DOMEntityResolver(NewStyle):
    """Default resolver that opens a system identifier as a URL.

    The URL opener is created on first use and then cached on the
    instance in the ``_opener`` slot.
    """

    __slots__ = '_opener',

    def resolveEntity(self, publicId, systemId):
        """Open *systemId* and return a populated DOMInputSource.

        The returned source carries the ids, the opened byte stream, the
        encoding reported by the transport (or None) and, when it can be
        derived, a base URI.  *systemId* must not be None.
        """
        assert systemId is not None
        source = DOMInputSource()
        source.publicId = publicId
        source.systemId = systemId
        source.byteStream = self._get_opener().open(systemId)

        # determine the encoding if the transport provided it
        source.encoding = self._guess_media_encoding(source)

        # determine the base URI if we can
        import posixpath
        import urlparse
        parts = urlparse.urlparse(systemId)
        scheme, netloc, path, params, query, fragment = parts
        # XXX should we check the scheme here as well?
        # NOTE(review): baseURI stays None when the path is empty or
        # already ends in "/".
        if path and not path.endswith("/"):
            path = posixpath.dirname(path) + "/"
            parts = scheme, netloc, path, params, query, fragment
            source.baseURI = urlparse.urlunparse(parts)

        return source

    def _get_opener(self):
        """Return the cached opener, creating it on first use."""
        try:
            return self._opener
        except AttributeError:
            # The slot has never been assigned.
            self._opener = self._create_opener()
            return self._opener

    def _create_opener(self):
        """Build the URL opener; kept separate so it can be overridden."""
        import urllib2
        return urllib2.build_opener()

    def _guess_media_encoding(self, source):
        """Return the lower-cased charset from the stream's Content-Type.

        Returns None when there is no Content-Type header or no charset
        parameter.
        """
        info = source.byteStream.info()
        # ``in`` rather than the deprecated has_key(), so any mapping-like
        # header object works.
        if "Content-Type" in info:
            for param in info.getplist():
                if param.startswith("charset="):
                    return param.split("=", 1)[1].lower()
        return None
|
|
|
|
|
|
class DOMInputSource(NewStyle):
    """Plain record naming the input of a parse.

    Every field starts out as None; the _get_*/_set_* methods simply
    read and write the attribute of the same name.
    """

    __slots__ = ('byteStream', 'characterStream', 'stringData',
                 'encoding', 'publicId', 'systemId', 'baseURI')

    def __init__(self):
        # Initialise each slot, in declaration order, to "not provided".
        for field in DOMInputSource.__slots__:
            setattr(self, field, None)

    # -- byteStream ----------------------------------------------------
    def _get_byteStream(self):
        return self.byteStream

    def _set_byteStream(self, byteStream):
        self.byteStream = byteStream

    # -- characterStream -----------------------------------------------
    def _get_characterStream(self):
        return self.characterStream

    def _set_characterStream(self, characterStream):
        self.characterStream = characterStream

    # -- stringData ----------------------------------------------------
    def _get_stringData(self):
        return self.stringData

    def _set_stringData(self, data):
        self.stringData = data

    # -- encoding ------------------------------------------------------
    def _get_encoding(self):
        return self.encoding

    def _set_encoding(self, encoding):
        self.encoding = encoding

    # -- publicId ------------------------------------------------------
    def _get_publicId(self):
        return self.publicId

    def _set_publicId(self, publicId):
        self.publicId = publicId

    # -- systemId ------------------------------------------------------
    def _get_systemId(self):
        return self.systemId

    def _set_systemId(self, systemId):
        self.systemId = systemId

    # -- baseURI -------------------------------------------------------
    def _get_baseURI(self):
        return self.baseURI

    def _set_baseURI(self, uri):
        self.baseURI = uri
|
|
|
|
|
|
class DOMBuilderFilter:
    """Accept-everything filter for tailoring how a DOM instance is built.

    Both callbacks return FILTER_ACCEPT and whatToShow selects all node
    types, so using an instance unchanged filters nothing.
    """

    # Strictly speaking this class is optional: a concrete filter can
    # provide acceptNode() and startContainer() directly.  Deriving from
    # it is a convenience for overriding only one of the two.

    FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP, FILTER_INTERRUPT = 1, 2, 3, 4

    whatToShow = NodeFilter.SHOW_ALL

    def _get_whatToShow(self):
        return self.whatToShow

    def acceptNode(self, element):
        verdict = self.FILTER_ACCEPT
        return verdict

    def startContainer(self, element):
        verdict = self.FILTER_ACCEPT
        return verdict
|
|
|
|
# NodeFilter is used above only to initialise DOMBuilderFilter.whatToShow;
# remove it so it does not linger as a module-level name.
del NodeFilter
|
|
|
|
|
|
class DocumentLS:
    """Mixin to create documents that conform to the load/save spec.

    Only synchronous operation is supported.  The legacy attribute
    ``async`` is attached right after the class statement because the
    name cannot be written literally on Python 3.7+; ``async_`` is a
    spellable alias with the same value.
    """

    async_ = False

    def _get_async(self):
        """Return False: asynchronous loading is never enabled."""
        return False

    def _set_async(self, flag):
        """Reject any attempt to switch asynchronous loading on.

        Raises xml.dom.NotSupportedErr if *flag* is true; a false value
        is accepted and ignored.
        """
        if flag:
            raise xml.dom.NotSupportedErr(
                "asynchronous document loading is not supported")

    def abort(self):
        # What does it mean to "clear" a document?  Does the
        # documentElement disappear?
        raise NotImplementedError(
            "haven't figured out what this means yet")

    def load(self, uri):
        raise NotImplementedError("haven't written this yet")

    def loadXML(self, source):
        raise NotImplementedError("haven't written this yet")

    def saveXML(self, snode):
        """Serialize *snode*, or this document when *snode* is None.

        Raises xml.dom.WrongDocumentErr if *snode* is owned by a
        different document.
        """
        if snode is None:
            snode = self
        elif snode.ownerDocument is not self:
            raise xml.dom.WrongDocumentErr()
        return snode.toxml()


# ``async`` is a reserved word from Python 3.7 on, so the attribute is
# attached by name; existing ``obj.async`` lookups keep working on
# interpreters that still allow that spelling.
setattr(DocumentLS, "async", False)
|
|
|
|
|
|
class DOMImplementationLS:
    """Factory for the load/save helper objects of this module."""

    MODE_SYNCHRONOUS = 1
    MODE_ASYNCHRONOUS = 2

    def createDOMBuilder(self, mode, schemaType):
        """Return a new DOMBuilder for synchronous *mode*.

        Raises xml.dom.NotSupportedErr when *schemaType* is not None or
        when asynchronous mode is requested, and ValueError for any
        other *mode*.
        """
        # The schema type is checked before the mode is even looked at.
        if schemaType is not None:
            raise xml.dom.NotSupportedErr(
                "schemaType not yet supported")
        if mode == self.MODE_SYNCHRONOUS:
            return DOMBuilder()
        elif mode == self.MODE_ASYNCHRONOUS:
            raise xml.dom.NotSupportedErr(
                "asynchronous builders are not supported")
        else:
            raise ValueError("unknown value for mode")

    def createDOMWriter(self):
        """Always raises NotImplementedError; there is no writer yet."""
        raise NotImplementedError(
            "the writer interface hasn't been written yet!")

    def createDOMInputSource(self):
        """Return a fresh, empty DOMInputSource."""
        return DOMInputSource()
|