# encoding: utf-8
import sys
JYTHON = sys.platform.startswith("java")
import doctest
import xml.parsers.expat as expat
from xml.etree.ElementTree import *
def jython(function):
    """Return *function* unchanged when ``JYTHON`` is true, else ``None``.

    ``JYTHON`` is the module-level flag computed from ``sys.platform``.
    """
    # NOTE(review): presumably meant as a decorator that disables
    # Jython-only tests on other interpreters -- no use is visible here.
    return function if JYTHON else None
class sortdict(dict):
    """Dictionary whose ``repr``/``str`` list the items in sorted order.

    The doctests in this module print instances of this class so that the
    expected output does not depend on dictionary iteration order.
    """

    def __repr__(self):
        """Return ``{key: value, ...}`` with the pairs sorted."""
        # sorted() accepts both the list returned by Python 2's items() and
        # the view returned by Python 3's, unlike the in-place list.sort().
        pairs = ["%r: %r" % pair for pair in sorted(self.items())]
        return "{%s}" % ", ".join(pairs)

    __str__ = __repr__
class Outputter:
def StartElementHandler(self, name, attrs):
print 'Start element:\n ', repr(name), sortdict(attrs)
def EndElementHandler(self, name):
print 'End element:\n ', repr(name)
def CharacterDataHandler(self, data):
data = data.strip()
if data:
print 'Character data:'
print ' ', repr(data)
def ProcessingInstructionHandler(self, target, data):
print 'PI:\n ', repr(target), repr(data)
def StartNamespaceDeclHandler(self, prefix, uri):
print 'NS decl:\n ', repr(prefix), repr(uri)
def EndNamespaceDeclHandler(self, prefix):
print 'End of NS decl:\n ', repr(prefix)
def StartCdataSectionHandler(self):
print 'Start of CDATA section'
def EndCdataSectionHandler(self):
print 'End of CDATA section'
def CommentHandler(self, text):
print 'Comment:\n ', repr(text)
def NotationDeclHandler(self, *args):
name, base, sysid, pubid = args
print 'Notation declared:', args
def UnparsedEntityDeclHandler(self, *args):
entityName, base, systemId, publicId, notationName = args
print 'Unparsed entity decl:\n ', args
def NotStandaloneHandler(self, userData):
print 'Not standalone'
return 1
def ExternalEntityRefHandler(self, *args):
context, base, sysId, pubId = args
print 'External entity ref:', args[1:]
return 1
def DefaultHandler(self, userData):
pass
def DefaultHandlerExpand(self, userData):
pass
# NOTE(review): module-level string bound to `_`; nothing visible in this file
# reads it.  It looks like a copy of the sample document used by test_utf8
# whose XML markup is missing -- confirm whether it can be removed.
_= """
>>> data = '''\
...
...
...
...
...
...
...
...
... %unparsed_entity;
... ]>
...
...
...
... Contents of subelements
...
...
... &external_entity;
...
... '''
"""
# Doctest: binds every Outputter callback listed in HANDLER_NAMES to a parser
# created with namespace_separator='!', parses the sample document with
# returns_unicode = 1 and compares the printed event trace.  The
# returns_unicode = 0 variant is present only as '#'-prefixed text.
# NOTE(review): the sample document consists mostly of bare '...' lines, so
# its XML markup looks lost -- restore it from the original test_pyexpat.py.
def test_utf8():
    """
    Source: test_pyexpat.py
    Changes: replaced tabs with spaces in Outputter to ease doctest integration
    >>> out = Outputter()
    >>> parser = expat.ParserCreate(namespace_separator='!')
    >>> HANDLER_NAMES = [
    ... 'StartElementHandler', 'EndElementHandler',
    ... 'CharacterDataHandler',
    ... 'ProcessingInstructionHandler',
    ... 'UnparsedEntityDeclHandler', 'NotationDeclHandler',
    ... 'StartNamespaceDeclHandler', 'EndNamespaceDeclHandler',
    ... 'CommentHandler', 'StartCdataSectionHandler',
    ... 'EndCdataSectionHandler',
    ... 'DefaultHandler', 'DefaultHandlerExpand',
    ... #'NotStandaloneHandler',
    ... 'ExternalEntityRefHandler'
    ... ]
    >>> for name in HANDLER_NAMES:
    ... setattr(parser, name, getattr(out, name))
    >>> data = '''\\
    ...
    ...
    ...
    ...
    ...
    ...
    ...
    ...
    ... %unparsed_entity;
    ... ]>
    ...
    ...
    ...
    ... Contents of subelements
    ...
    ...
    ... &external_entity;
    ...
    ... '''
    #Produce UTF-8 output
    #>>> parser.returns_unicode = 0
    #>>> try:
    #... parser.Parse(data, 1)
    #... except expat.error:
    #... print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode)
    #... print '** Line', parser.ErrorLineNumber
    #... print '** Column', parser.ErrorColumnNumber
    #... print '** Byte', parser.ErrorByteIndex
    #PI:
    #'xml-stylesheet' 'href="stylesheet.css"'
    #Comment:
    #' comment data '
    #Notation declared: ('notation', None, 'notation.jpeg', None)
    #Unparsed entity decl:
    #('unparsed_entity', None, 'entity.file', None, 'notation')
    #Start element:
    #'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'}
    #NS decl:
    #'myns' 'http://www.python.org/namespace'
    #Start element:
    #'http://www.python.org/namespace!subelement' {}
    #Character data:
    #'Contents of subelements'
    #End element:
    #'http://www.python.org/namespace!subelement'
    #End of NS decl:
    #'myns'
    #Start element:
    #'sub2' {}
    #Start of CDATA section
    #Character data:
    #'contents of CDATA section'
    #End of CDATA section
    #End element:
    #'sub2'
    #External entity ref: (None, 'entity.file', None)
    #End element:
    #'root'
    #1
    >>> parser = expat.ParserCreate(namespace_separator='!')
    >>> parser.returns_unicode = 1
    >>> for name in HANDLER_NAMES:
    ... setattr(parser, name, getattr(out, name))
    >>> try:
    ... parser.Parse(data, 1)
    ... except expat.error:
    ... print '** Line', parser.ErrorLineNumber
    ... print '** Column', parser.ErrorColumnNumber
    ... print '** Byte', parser.ErrorByteIndex #doctest: +REPORT_UDIFF
    PI:
    u'xml-stylesheet' u'href="stylesheet.css"'
    Comment:
    u' comment data '
    Notation declared: (u'notation', None, u'notation.jpeg', None)
    Unparsed entity decl:
    (u'unparsed_entity', None, u'entity.file', None, u'notation')
    Start element:
    u'root' {u'attr1': u'value1', u'attr2': u'value2\u1f40'}
    NS decl:
    u'myns' u'http://www.python.org/namespace'
    Start element:
    u'http://www.python.org/namespace!subelement' {}
    Character data:
    u'Contents of subelements'
    End element:
    u'http://www.python.org/namespace!subelement'
    End of NS decl:
    u'myns'
    Start element:
    u'sub2' {}
    Start of CDATA section
    Character data:
    u'contents of CDATA section'
    End of CDATA section
    End element:
    u'sub2'
    External entity ref: (None, u'entity.file', None)
    End element:
    u'root'
    1
    """
# Doctest: the parser module can be imported under the name `pyexpat`.
# NOTE(review): the `expat` example carries +ELLIPSIS but no expected output
# is visible; the expected line was presumably lost -- confirm.
def test_import_as_pyexpat():
    """
    >>> import pyexpat as expat
    >>> expat #doctest: +ELLIPSIS
    """
# Doctest: `expat.errors` exposes XML_ERROR_* names whose values are the
# corresponding message strings.
# NOTE(review): no expected output follows `>>> expat.errors`; it was
# presumably lost -- confirm.
def test_errors_submodule():
    """
    >>> import xml.parsers.expat as expat
    >>> expat.errors
    >>> dir(expat.errors) #doctest: +ELLIPSIS
    ['XML_ERROR_ABORTED', ..., 'XML_ERROR_XML_DECL', '__doc__', '__name__']
    >>> expat.errors.XML_ERROR_ABORTED
    'parsing aborted'
    >>> expat.errors.XML_ERROR_XML_DECL
    'XML declaration not well-formed'
    """
# Doctest: prints the namespace of `expat.model` through sortdict and checks
# the XML_CQUANT_* / XML_CTYPE_* constants.
# NOTE(review): no expected output follows `>>> expat.model`; it was
# presumably lost -- confirm.
def test_model_submodule():
    """
    >>> import xml.parsers.expat as expat
    >>> expat.model
    >>> print sortdict(expat.model.__dict__)
    {'XML_CQUANT_NONE': 0, 'XML_CQUANT_OPT': 1, 'XML_CQUANT_PLUS': 3, 'XML_CQUANT_REP': 2, 'XML_CTYPE_ANY': 2, 'XML_CTYPE_CHOICE': 5, 'XML_CTYPE_EMPTY': 1, 'XML_CTYPE_MIXED': 3, 'XML_CTYPE_NAME': 4, 'XML_CTYPE_SEQ': 6, '__doc__': 'Constants used to interpret content model information.', '__name__': 'pyexpat.model'}
    """
# Doctest: installs a CharacterDataHandler that raises, then calls Parse
# inside a bare try/except; the example is expected to produce no output.
# NOTE(review): both "%s" templates look as if surrounding markup was lost
# -- confirm against the original test.
def test_parse_only_xml_data():
    """
    Source: test_pyexpat.py, see also: http://python.org/sf/1296433
    Changes:
    - replaced 'iso8859' encoding with 'ISO-8859-1',
    - added isfinal=True keyword argument to Parse call (as in this port,
    the data is not processed until it is fully available).
    With these changes, the test still crashes CPython 2.5.
    >>> import xml.parsers.expat as expat
    >>> # xml = "%s" % ('a' * 1025)
    This one doesn't crash:
    >>> xml = "%s" % ('a' * 10000)
    >>> def handler(text):
    ... raise Exception
    >>> parser = expat.ParserCreate()
    >>> parser.CharacterDataHandler = handler
    >>> try:
    ... parser.Parse(xml, True)
    ... except:
    ... pass
    """
# Doctest: ParserCreate accepts None, a one-character string and the empty
# string as namespace_separator, raises TypeError for an int and ValueError
# for a longer string.
def test_namespace_separator():
    """
    Source: test_pyexpat.py
    Tests that make sure we get errors when the namespace_separator value
    is illegal, and that we don't for good values:
    >>> from xml.parsers.expat import ParserCreate
    >>> p = ParserCreate()
    >>> p = ParserCreate(namespace_separator=None)
    >>> p = ParserCreate(namespace_separator=' ')
    >>> p = ParserCreate(namespace_separator=42) #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    TypeError: ...
    >>> p = ParserCreate(namespace_separator='too long') #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: ...
    ParserCreate() needs to accept a namespace_separator of zero length
    to satisfy the requirements of RDF applications that are required
    to simply glue together the namespace URI and the localname. Though
    considered a wart of the RDF specifications, it needs to be supported.
    See XML-SIG mailing list thread starting with
    http://mail.python.org/pipermail/xml-sig/2001-April/005202.html
    >>> p = ParserCreate(namespace_separator='') # too short
    """
# Doctest: collects the names passed to the start/end element callbacks and
# checks that all of them are the very same object (`is`).
# NOTE(review): the document passed to Parse is " " although six callbacks
# are expected; its markup was presumably lost -- confirm.
def test_interning_machinery():
    """
    Source: test_pyexpat.py
    >>> from xml.parsers.expat import ParserCreate
    >>> p = ParserCreate()
    >>> L = []
    >>> def collector(name, *args):
    ... L.append(name)
    >>> p.StartElementHandler = collector
    >>> p.EndElementHandler = collector
    >>> p.Parse(" ", 1)
    1
    >>> tag = L[0]
    >>> len(L)
    6
    >>> all(tag is entry for entry in L)
    True
    """
# Doctest: a RuntimeError raised inside StartElementHandler is caught around
# Parse and its first argument is compared with "a".
# NOTE(review): the visible document is the empty string although an element
# named "a" is expected; its markup was presumably lost -- confirm.
def test_exception_from_callback():
    """
    Source: test_pyexpat.py
    >>> from xml.parsers.expat import ParserCreate
    >>> def StartElementHandler(name, attrs):
    ... raise RuntimeError(name)
    >>> parser = ParserCreate()
    >>> parser.StartElementHandler = StartElementHandler
    >>> try:
    ... parser.Parse("", 1)
    ... except RuntimeError, e:
    ... pass
    >>> e.args[0] == "a"
    True
    """
# Doctest: prints element names and their sorted attributes from
# StartElementHandler, first with a default parser and then with a parser
# created with namespace_separator="|".
# NOTE(review): the sample document consists of bare '...' lines; its markup
# was presumably lost -- confirm.
def test_with_and_without_namespace():
    """
    >>> from xml.parsers.expat import ParserCreate
    >>> xml = '''
    ...
    ...
    ... '''
    >>> def handler(name, attributes):
    ... attributes = sorted(attributes.items())
    ... print name
    ... for attr in attributes:
    ... print " %s = %r" % attr
    >>> parser = ParserCreate()
    >>> parser.StartElementHandler = handler
    >>> _ = parser.Parse(xml, True)
    root
    b = u'2'
    python:a = u'1'
    xmlns = u'http://www.python.org'
    xmlns:python = u'http://www.python.org'
    python:sub1
    sub2
    xmlns = u''
    >>> parser = ParserCreate(namespace_separator="|")
    >>> parser.StartElementHandler = handler
    >>> _ = parser.Parse(xml, True)
    http://www.python.org|root
    b = u'2'
    http://www.python.org|a = u'1'
    http://www.python.org|sub1
    sub2
    """
# Doctest: the element returned by XML() (imported from
# xml.etree.ElementTree) must have the text u'\u8230'.
# NOTE(review): the argument shows a bare literal character and no tags; it
# was presumably an element containing a character reference -- confirm.
def test_unicode_bug():
    """
    Regression introduced by revision 28
    >>> doc = XML("舰")
    >>> doc.text
    u'\u8230'
    """
# Doctest: installs printing callbacks for element, attribute-list, entity,
# notation and unparsed-entity declarations and parses a document with an
# internal DTD; the expected output lists ELEMENT events only.
# NOTE(review): the DTD lines are bare '...'; the declarations were
# presumably lost -- confirm.
def test_DTD():
    """
    >>> xml = '''
    ...
    ...
    ...
    ...
    ...
    ...
    ...
    ...
    ...
    ... ]>
    ... content
    ... '''
    >>> parser = expat.ParserCreate()
    >>> def handler(header, *args):
    ... def _handler(*args):
    ... print header + ":", args
    ... return _handler
    >>> parser.ElementDeclHandler = handler("ELEMENT")
    >>> parser.AttlistDeclHandler = handler("ATTRIBUTE")
    >>> parser.EntityDeclHandler = handler("ENTITY")
    >>> parser.NotationDeclHandler = handler("NOTATION")
    >>> parser.UnparsedEntityDeclHandler = handler("UNPARSED")
    >>> parser.Parse(xml, True)
    ELEMENT: (u'doc', (5, 0, None, ((4, 0, u'any', ()), (4, 0, u'empty', ()), (4, 0, u'text', ()), (4, 0, u'mixed', ()), (4, 0, u'opt', ()), (4, 0, u'many', ()), (4, 0, u'plus', ()))))
    ELEMENT: (u'any', (2, 0, None, ()))
    ELEMENT: (u'empty', (1, 0, None, ()))
    ELEMENT: (u'text', (3, 0, None, ()))
    ELEMENT: (u'sequence', (6, 0, None, ((4, 0, u'_sequence', ()),)))
    ELEMENT: (u'_sequence', (6, 0, None, ((4, 0, u'any', ()), (4, 0, u'any', ()))))
    ELEMENT: (u'mixed', (3, 2, None, ((4, 0, u'any', ()),)))
    ELEMENT: (u'opt', (6, 1, None, ((4, 0, u'empty', ()),)))
    ELEMENT: (u'many', (6, 2, None, ((4, 0, u'empty', ()),)))
    ELEMENT: (u'plus', (6, 3, None, ((4, 0, u'empty', ()),)))
    1
    """
# Doctest: parses documents that reference entities, first with no callbacks
# installed, then with printing CharacterDataHandler and DefaultHandler
# callbacks.
# NOTE(review): the DOCTYPE / entity declarations before ']>' are not
# visible; they were presumably lost -- confirm.
def test_entity():
    """
    TODO: need a fallback for entity-resolver so that empty source is returned.
    >>> xml = '''
    ... ]>
    ... &ext-entity;&in-ext-dtd-entity;'''
    >>> parser = expat.ParserCreate()
    >>> parser.Parse(xml, True)
    1
    EXPAT OH MY ! When applicable (internal entities), the CharacterDataHandler
    callback will override DefaultHandlerExpand, but it WON'T override
    DefaultHandler. On the other hand, the DefaultHandlerExpand callback WILL
    override DefaultHandler ... More tests todo here ...
    >>> xml = '''
    ...
    ... ]>
    ... &int-entity;&ext-entity;&in-ext-dtd-entity;'''
    >>> parser = expat.ParserCreate()
    >>> def handler(header):
    ... def _handler(*args):
    ... print header + ":", args
    ... return 1
    ... return _handler
    >>> parser.CharacterDataHandler = handler("text")
    >>> parser.DefaultHandler = handler("default")
    >>> parser.Parse(xml, True) #doctest: +ELLIPSIS
    default: ...
    default: (u'&int-entity;',)
    default: (u'&ext-entity;',)
    default: (u'&in-ext-dtd-entity;',)
    ...
    1
    EXPAT OH MY ! When applicable (internal entities), the CharacterDataHandler
    callback will override DefaultHandlerExpand, but it WON'T override
    DefaultHandler. On the other hand, the DefaultHandlerExpand callback WILL
    override DefaultHandler ... More tests todo here ...
    """
# Doctest: a printing ExternalEntityRefHandler that returns 1 is expected to
# be called once for the `&entity;` reference.
# NOTE(review): the DOCTYPE / entity declaration before ']>' is not visible;
# it was presumably lost -- confirm.
def test_resolve_entity_handlers():
    """
    >>> xml = '''
    ... ]>
    ... &entity;'''
    >>> def handler(header):
    ... def _handler(*args):
    ... print header + ":", args
    ... return 1
    ... return _handler
    >>> parser = expat.ParserCreate()
    >>> parser.ExternalEntityRefHandler = handler("ExternalEntityRefHandler")
    >>> parser.Parse(xml, True)
    ExternalEntityRefHandler: (u'entity', None, u'entity', None)
    1
    """
def handler(name, header="XML>", returns=None):
def _handler(*args):
if len(args) == 1:
args = "(%r)" % args[0]
else:
args = str(args)
print header, name + "%s" % args
return returns
return _handler
def parse(xml, *handlers):
    """Parse *xml* with a fresh expat parser and printing callbacks.

    Each string in *handlers* is used as the parser attribute name under
    which a callback built by ``handler()`` is installed.  The callback for
    "ExternalEntityRefHandler" returns 1, every other callback returns None.
    The result of ``Parse`` is discarded, so this function returns None.
    """
    xml_parser = expat.ParserCreate()
    for callback_name in handlers:
        result = 1 if callback_name == "ExternalEntityRefHandler" else None
        callback = handler(callback_name, returns=result)
        setattr(xml_parser, callback_name, callback)
    # True marks the data as the final (here: only) chunk of the document.
    xml_parser.Parse(xml, True)
# Doctest: uses parse() to show, for each combination of
# CharacterDataHandler, DefaultHandler and DefaultHandlerExpand, which
# callback reports the `&entity;` reference and in which form.
# NOTE(review): the DOCTYPE / entity declaration before ']>' is not visible;
# it was presumably lost -- confirm.
def test_internal_entities():
    """
    >>> xml = '''
    ... ]>
    ... &entity;'''
    >>> parse(xml)
    >>> parse(xml, "CharacterDataHandler")
    XML> CharacterDataHandler(u'entity-content')
    >>> parse(xml, "DefaultHandler") #doctest: +ELLIPSIS
    XML> ...DefaultHandler(u'&entity;')...
    >>> parse(xml, "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...DefaultHandlerExpand(u'entity-content')...
    # Uhu ?
    >>> parse(xml, "CharacterDataHandler",
    ... "DefaultHandler") #doctest: +ELLIPSIS
    XML> ...DefaultHandler(u'&entity;')...
    >>> parse(xml, "CharacterDataHandler",
    ... "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...CharacterDataHandler(u'entity-content')...
    >>> parse(xml, "DefaultHandler",
    ... "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...DefaultHandlerExpand(u'entity-content')...
    >>> parse(xml, "CharacterDataHandler",
    ... "DefaultHandler",
    ... "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...CharacterDataHandler(u'entity-content')...
    """
# Doctest: uses parse() to show, for each combination of
# ExternalEntityRefHandler, DefaultHandler and DefaultHandlerExpand, which
# callback reports the `&entity;` reference.
# NOTE(review): the DOCTYPE / entity declaration before ']>' is not visible;
# it was presumably lost -- confirm.
def test_external_entities():
    """
    >>> xml = '''
    ... ]>
    ... &entity;'''
    >>> parse(xml)
    >>> parse(xml, "ExternalEntityRefHandler")
    XML> ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')
    >>> parse(xml, "DefaultHandler") #doctest: +ELLIPSIS
    XML> ...DefaultHandler(u'&entity;')...
    >>> parse(xml, "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...DefaultHandlerExpand(u'&entity;')...
    >>> parse(xml, "ExternalEntityRefHandler",
    ... "DefaultHandler") #doctest: +ELLIPSIS
    XML> ...ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')...
    >>> parse(xml, "ExternalEntityRefHandler",
    ... "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')...
    >>> parse(xml, "DefaultHandler",
    ... "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...DefaultHandlerExpand(u'&entity;')...
    >>> parse(xml, "ExternalEntityRefHandler",
    ... "DefaultHandler",
    ... "DefaultHandlerExpand") #doctest: +ELLIPSIS
    XML> ...ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')...
    """
# Doctest: parse() on a reference to an undeclared entity is expected to
# raise ExpatError with the message "undefined entity: line 1, column 5".
# NOTE(review): the visible document is only "&entity;"; enclosing markup was
# presumably lost -- confirm.
def test_undefined_entities():
    """
    >>> xml = "&entity;"
    >>> parse(xml)
    Traceback (most recent call last):
    ...
    ExpatError: undefined entity: line 1, column 5
    """
def locate(parser, name):
def _handler(*args):
print name, parser.CurrentLineNumber, parser.CurrentColumnNumber
return _handler
# Doctest: installs locate() callbacks for character data, element start and
# element end, and compares the printed line/column positions for two
# documents.
# NOTE(review): the sample documents show text and bare '...' lines only;
# their tags were presumably lost -- confirm.
def test_current_location():
    """
    >>> xml = '''texttext
    ...
    ... text
    ... '''
    >>> parser = expat.ParserCreate()
    >>> parser.CharacterDataHandler = locate(parser, "TEXT:")
    >>> parser.StartElementHandler = locate(parser, "START:")
    >>> parser.EndElementHandler = locate(parser, "END:")
    >>> _ = parser.Parse(xml, True) #doctest: +ELLIPSIS
    START: 1 0
    TEXT: 1 5...
    START: 1 9
    END: 1 15
    TEXT: 1 15...
    START: 1 19
    END: 1 24
    TEXT: 1 30...
    START: 2 0
    END: 2 5
    TEXT: 2 11...
    START: 3 4
    END: 3 10
    TEXT: 3 10...
    END: 4 0
    >>> xml = '''
    ... start tag after some text
    ...
    ...
    ... '''
    >>> parser = expat.ParserCreate()
    >>> parser.CharacterDataHandler = locate(parser, "TEXT:")
    >>> parser.StartElementHandler = locate(parser, "START:")
    >>> parser.EndElementHandler = locate(parser, "END:")
    >>> _ = parser.Parse(xml, True) #doctest: +ELLIPSIS
    START: 1 0
    TEXT: 1 5...
    START: 2 25
    END: 2 31
    TEXT: 2 31...
    START: 3 0
    END: 3 5
    START: 3 11
    END: 3 17
    TEXT: 3 17...
    START: 4 0
    END: 4 6
    START: 4 6
    END: 4 12
    TEXT: 4 12...
    END: 5 0
    """
# Doctest: the local error() helper returns (lineno, offset) of the
# ExpatError raised while parsing a malformed document.
# NOTE(review): the line `>>> error("text>> # Jython` looks like two examples
# fused together.  Everything after it imports Jython/Java SAX classes and
# is unrelated to error locations, so the end of this docstring and the
# header of a separate (presumably Jython-only) test appear to be missing --
# restore from the original file before running.
def test_error_location():
    """
    Source: selftest.py, ElementTree 1.3a3
    Changes: removed dependencies in ElementTree, added one extra test
    >>> def error(xml):
    ... p = expat.ParserCreate()
    ... try:
    ... p.Parse(xml, True)
    ... except expat.ExpatError, e:
    ... return e.lineno, e.offset
    >>> error("foo")
    (1, 0)
    >>> error("&foo;")
    (1, 5)
    >>> error("foobar<")
    (1, 6)
    >>> error("text>> # Jython
    >>> from org.python.core.util import StringUtil
    >>> from jarray import array
    >>> # Java Standard Edition
    >>> from org.xml.sax import *
    >>> from org.xml.sax.ext import *
    >>> from org.xml.sax.helpers import *
    >>> from java.io import ByteArrayInputStream
    >>> xml = '''
    ... ]>
    ... &entity;
    ... '''
    >>> def empty_source():
    ... _source = InputSource()
    ... byte_stream = ByteArrayInputStream(array([], "b"))
    ... _source.setByteStream(byte_stream)
    ... return _source
    >>> class Handler(EntityResolver2):
    ... def getExternalSubset(self, name, baseURI):
    ... return None
    ... def resolveEntity(self, name, publicId, baseURI, systemId):
    ... print "Entity name:", name
    ... return empty_source()
    >>> def main():
    ... sax_parser = "org.apache.xerces.parsers.SAXParser"
    ... reader = XMLReaderFactory.createXMLReader(sax_parser)
    ... entity_resolver2 = "http://xml.org/sax/features/use-entity-resolver2"
    ... enabled = reader.getFeature(entity_resolver2)
    ... print "Entity-Resolver2 enabled:", enabled
    ... handler = Handler()
    ... reader.setEntityResolver(handler)
    ... bytes = StringUtil.toBytes(xml)
    ... byte_stream = ByteArrayInputStream(bytes)
    ... source = InputSource(byte_stream)
    ... reader.parse(source)
    >>> main()
    Entity-Resolver2 enabled: True
    Entity name: entity
    """
# When executed as a script, run the doctests found in this module's
# docstrings.
if __name__ == "__main__":
    doctest.testmod()