726d90d871
git-svn-id: http://google-refine.googlecode.com/svn/branches/split-refactor@908 7d457c2a-affb-35e4-300a-418c747d4874
353 lines
12 KiB
Python
353 lines
12 KiB
Python
import xml.sax
|
|
import xml.sax.handler
|
|
import types
|
|
|
|
try:
|
|
_StringTypes = [types.StringType, types.UnicodeType]
|
|
except AttributeError:
|
|
_StringTypes = [types.StringType]
|
|
|
|
START_ELEMENT = "START_ELEMENT"
|
|
END_ELEMENT = "END_ELEMENT"
|
|
COMMENT = "COMMENT"
|
|
START_DOCUMENT = "START_DOCUMENT"
|
|
END_DOCUMENT = "END_DOCUMENT"
|
|
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
|
|
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
|
|
CHARACTERS = "CHARACTERS"
|
|
|
|
class PullDOM(xml.sax.ContentHandler):
|
|
_locator = None
|
|
document = None
|
|
|
|
def __init__(self, documentFactory=None):
|
|
from xml.dom import XML_NAMESPACE
|
|
self.documentFactory = documentFactory
|
|
self.firstEvent = [None, None]
|
|
self.lastEvent = self.firstEvent
|
|
self.elementStack = []
|
|
self.push = self.elementStack.append
|
|
try:
|
|
self.pop = self.elementStack.pop
|
|
except AttributeError:
|
|
# use class' pop instead
|
|
pass
|
|
self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
|
|
self._current_context = self._ns_contexts[-1]
|
|
self.pending_events = []
|
|
|
|
def pop(self):
|
|
result = self.elementStack[-1]
|
|
del self.elementStack[-1]
|
|
return result
|
|
|
|
def setDocumentLocator(self, locator):
|
|
self._locator = locator
|
|
|
|
def startPrefixMapping(self, prefix, uri):
|
|
if not hasattr(self, '_xmlns_attrs'):
|
|
self._xmlns_attrs = []
|
|
self._xmlns_attrs.append((prefix or 'xmlns', uri))
|
|
self._ns_contexts.append(self._current_context.copy())
|
|
self._current_context[uri] = prefix or None
|
|
|
|
def endPrefixMapping(self, prefix):
|
|
self._current_context = self._ns_contexts.pop()
|
|
|
|
def startElementNS(self, name, tagName , attrs):
|
|
uri, localname = name
|
|
if uri:
|
|
# When using namespaces, the reader may or may not
|
|
# provide us with the original name. If not, create
|
|
# *a* valid tagName from the current context.
|
|
if tagName is None:
|
|
prefix = self._current_context[uri]
|
|
if prefix:
|
|
tagName = prefix + ":" + localname
|
|
else:
|
|
tagName = localname
|
|
if self.document:
|
|
node = self.document.createElementNS(uri, tagName)
|
|
else:
|
|
node = self.buildDocument(uri, tagName)
|
|
else:
|
|
# When the tagname is not prefixed, it just appears as
|
|
# localname
|
|
if self.document:
|
|
node = self.document.createElement(localname)
|
|
else:
|
|
node = self.buildDocument(None, localname)
|
|
|
|
# Retrieve xml namespace declaration attributes.
|
|
xmlns_uri = 'http://www.w3.org/2000/xmlns/'
|
|
xmlns_attrs = getattr(self, '_xmlns_attrs', None)
|
|
if xmlns_attrs is not None:
|
|
for aname, value in xmlns_attrs:
|
|
if aname == 'xmlns':
|
|
qname = aname
|
|
else:
|
|
qname = 'xmlns:' + aname
|
|
attr = self.document.createAttributeNS(xmlns_uri, qname)
|
|
attr.value = value
|
|
node.setAttributeNodeNS(attr)
|
|
self._xmlns_attrs = []
|
|
for aname,value in attrs.items():
|
|
a_uri, a_localname = aname
|
|
if a_uri:
|
|
prefix = self._current_context[a_uri]
|
|
if prefix:
|
|
qname = prefix + ":" + a_localname
|
|
else:
|
|
qname = a_localname
|
|
attr = self.document.createAttributeNS(a_uri, qname)
|
|
node.setAttributeNodeNS(attr)
|
|
else:
|
|
attr = self.document.createAttribute(a_localname)
|
|
node.setAttributeNode(attr)
|
|
attr.value = value
|
|
|
|
self.lastEvent[1] = [(START_ELEMENT, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
self.push(node)
|
|
|
|
def endElementNS(self, name, tagName):
|
|
self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
|
|
def startElement(self, name, attrs):
|
|
if self.document:
|
|
node = self.document.createElement(name)
|
|
else:
|
|
node = self.buildDocument(None, name)
|
|
|
|
for aname,value in attrs.items():
|
|
attr = self.document.createAttribute(aname)
|
|
attr.value = value
|
|
node.setAttributeNode(attr)
|
|
|
|
self.lastEvent[1] = [(START_ELEMENT, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
self.push(node)
|
|
|
|
def endElement(self, name):
|
|
self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
|
|
def comment(self, s):
|
|
if self.document:
|
|
node = self.document.createComment(s)
|
|
self.lastEvent[1] = [(COMMENT, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
else:
|
|
event = [(COMMENT, s), None]
|
|
self.pending_events.append(event)
|
|
|
|
def processingInstruction(self, target, data):
|
|
if self.document:
|
|
node = self.document.createProcessingInstruction(target, data)
|
|
self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
else:
|
|
event = [(PROCESSING_INSTRUCTION, target, data), None]
|
|
self.pending_events.append(event)
|
|
|
|
def ignorableWhitespace(self, chars):
|
|
node = self.document.createTextNode(chars)
|
|
self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
|
|
def characters(self, chars):
|
|
node = self.document.createTextNode(chars)
|
|
self.lastEvent[1] = [(CHARACTERS, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
|
|
def startDocument(self):
|
|
if self.documentFactory is None:
|
|
import xml.dom.minidom
|
|
self.documentFactory = xml.dom.minidom.Document.implementation
|
|
|
|
def buildDocument(self, uri, tagname):
|
|
# Can't do that in startDocument, since we need the tagname
|
|
# XXX: obtain DocumentType
|
|
node = self.documentFactory.createDocument(uri, tagname, None)
|
|
self.document = node
|
|
self.lastEvent[1] = [(START_DOCUMENT, node), None]
|
|
self.lastEvent = self.lastEvent[1]
|
|
self.push(node)
|
|
# Put everything we have seen so far into the document
|
|
for e in self.pending_events:
|
|
if e[0][0] == PROCESSING_INSTRUCTION:
|
|
_,target,data = e[0]
|
|
n = self.document.createProcessingInstruction(target, data)
|
|
e[0] = (PROCESSING_INSTRUCTION, n)
|
|
elif e[0][0] == COMMENT:
|
|
n = self.document.createComment(e[0][1])
|
|
e[0] = (COMMENT, n)
|
|
else:
|
|
raise AssertionError("Unknown pending event ",e[0][0])
|
|
self.lastEvent[1] = e
|
|
self.lastEvent = e
|
|
self.pending_events = None
|
|
return node.firstChild
|
|
|
|
def endDocument(self):
|
|
self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
|
|
self.pop()
|
|
|
|
def clear(self):
|
|
"clear(): Explicitly release parsing structures"
|
|
self.document = None
|
|
|
|
class ErrorHandler:
|
|
def warning(self, exception):
|
|
print exception
|
|
def error(self, exception):
|
|
raise exception
|
|
def fatalError(self, exception):
|
|
raise exception
|
|
|
|
class DOMEventStream:
|
|
def __init__(self, stream, parser, bufsize):
|
|
self.stream = stream
|
|
self.parser = parser
|
|
self.bufsize = bufsize
|
|
if not hasattr(self.parser, 'feed'):
|
|
self.getEvent = self._slurp
|
|
self.reset()
|
|
|
|
def reset(self):
|
|
self.pulldom = PullDOM()
|
|
# This content handler relies on namespace support
|
|
self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
|
self.parser.setContentHandler(self.pulldom)
|
|
|
|
def __getitem__(self, pos):
|
|
rc = self.getEvent()
|
|
if rc:
|
|
return rc
|
|
raise IndexError
|
|
|
|
def next(self):
|
|
rc = self.getEvent()
|
|
if rc:
|
|
return rc
|
|
raise StopIteration
|
|
|
|
def __iter__(self):
|
|
return self
|
|
|
|
def expandNode(self, node):
|
|
event = self.getEvent()
|
|
parents = [node]
|
|
while event:
|
|
token, cur_node = event
|
|
if cur_node is node:
|
|
return
|
|
if token != END_ELEMENT:
|
|
parents[-1].appendChild(cur_node)
|
|
if token == START_ELEMENT:
|
|
parents.append(cur_node)
|
|
elif token == END_ELEMENT:
|
|
del parents[-1]
|
|
event = self.getEvent()
|
|
|
|
def getEvent(self):
|
|
# use IncrementalParser interface, so we get the desired
|
|
# pull effect
|
|
if not self.pulldom.firstEvent[1]:
|
|
self.pulldom.lastEvent = self.pulldom.firstEvent
|
|
while not self.pulldom.firstEvent[1]:
|
|
buf = self.stream.read(self.bufsize)
|
|
if not buf:
|
|
self.parser.close()
|
|
return None
|
|
self.parser.feed(buf)
|
|
rc = self.pulldom.firstEvent[1][0]
|
|
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
|
|
return rc
|
|
|
|
def _slurp(self):
|
|
""" Fallback replacement for getEvent() using the
|
|
standard SAX2 interface, which means we slurp the
|
|
SAX events into memory (no performance gain, but
|
|
we are compatible to all SAX parsers).
|
|
"""
|
|
self.parser.parse(self.stream)
|
|
self.getEvent = self._emit
|
|
return self._emit()
|
|
|
|
def _emit(self):
|
|
""" Fallback replacement for getEvent() that emits
|
|
the events that _slurp() read previously.
|
|
"""
|
|
if self.pulldom.firstEvent[1] is None:
|
|
return None
|
|
rc = self.pulldom.firstEvent[1][0]
|
|
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
|
|
return rc
|
|
|
|
def clear(self):
|
|
"""clear(): Explicitly release parsing objects"""
|
|
self.pulldom.clear()
|
|
del self.pulldom
|
|
self.parser = None
|
|
self.stream = None
|
|
|
|
class SAX2DOM(PullDOM):
|
|
|
|
def startElementNS(self, name, tagName , attrs):
|
|
PullDOM.startElementNS(self, name, tagName, attrs)
|
|
curNode = self.elementStack[-1]
|
|
parentNode = self.elementStack[-2]
|
|
parentNode.appendChild(curNode)
|
|
|
|
def startElement(self, name, attrs):
|
|
PullDOM.startElement(self, name, attrs)
|
|
curNode = self.elementStack[-1]
|
|
parentNode = self.elementStack[-2]
|
|
parentNode.appendChild(curNode)
|
|
|
|
def processingInstruction(self, target, data):
|
|
PullDOM.processingInstruction(self, target, data)
|
|
node = self.lastEvent[0][1]
|
|
parentNode = self.elementStack[-1]
|
|
parentNode.appendChild(node)
|
|
|
|
def ignorableWhitespace(self, chars):
|
|
PullDOM.ignorableWhitespace(self, chars)
|
|
node = self.lastEvent[0][1]
|
|
parentNode = self.elementStack[-1]
|
|
parentNode.appendChild(node)
|
|
|
|
def characters(self, chars):
|
|
PullDOM.characters(self, chars)
|
|
node = self.lastEvent[0][1]
|
|
parentNode = self.elementStack[-1]
|
|
parentNode.appendChild(node)
|
|
|
|
|
|
default_bufsize = (2 ** 14) - 20
|
|
|
|
def parse(stream_or_string, parser=None, bufsize=None):
|
|
if bufsize is None:
|
|
bufsize = default_bufsize
|
|
if type(stream_or_string) in _StringTypes:
|
|
stream = open(stream_or_string)
|
|
else:
|
|
stream = stream_or_string
|
|
if not parser:
|
|
parser = xml.sax.make_parser()
|
|
return DOMEventStream(stream, parser, bufsize)
|
|
|
|
def parseString(string, parser=None):
|
|
try:
|
|
from cStringIO import StringIO
|
|
except ImportError:
|
|
from StringIO import StringIO
|
|
|
|
bufsize = len(string)
|
|
buf = StringIO(string)
|
|
if not parser:
|
|
parser = xml.sax.make_parser()
|
|
return DOMEventStream(buf, parser, bufsize)
|