1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 """This module contains support for processing XML using a SAX parser.
16
17 In particular, it provides a L{base content handler class<BaseSAXHandler>}
18 that maintains namespace context and element state in a stack; and a L{base
19 element state class <SAXElementState>} which records the location of the
20 element in the stream. These classes are extended for specific parsing needs
21 (e.g., L{pyxb.binding.saxer}).
22 """
23
24 import xml.sax
25 import xml.sax.handler
26 import pyxb.namespace
27
29 """A SAX handler class which prints each method invocation.
30 """
31
32
33 __trace = False
34
36 print 'setDocumentLocator %s' % (locator,)
37
40
42 print 'startPrefixMapping %s %s' % (prefix, uri)
43
45 print 'endPrefixMapping %s' % (prefix,)
46
48 print 'startElementNS %s %s' % (name, qname)
49
51 print 'endElementNS %s %s' % (name, qname)
52
54 print 'characters %s' % (content,)
55
57 print 'ignorableWhitespace len %d' % (len(whitespace),)
58
60 print 'processingInstruction %s' % (data,)
61
63 """A SAX handler class which doesn't do anything. Used to get baseline
64 performance parsing a particular document.
65 """
66
69
72
75
78
81
84
87
90
93
94
96 """State corresponding to processing a given element with the SAX
97 model."""
98
100 """Reference to the SAXElementState of the element enclosing this
101 one."""
102 return self.__parentState
103 __parentState = None
104
def namespaceContext (self):
    """Return the namespace context recorded for this element state.

    The value is documented as the
    L{pyxb.namespace.resolution.NamespaceContext} used for this binding.
    """
    context = self.__namespaceContext
    return context
109 __namespaceContext = None
110
112 """The L{expanded name<pyxb.namespace.ExpandedName>} of the
113 element."""
114 return self.__expandedName
115 __expandedName = None
116
118 """The L{location<pyxb.utils.utility.Location>} corresponding to the
119 element event."""
120 return self.__location
121 __location = None
122
def content (self):
    """Return the content accumulated for this element.

    The accumulation is supplied to the content model when the end of the
    element is reached.  Entries are appended by L{addTextContent} and
    L{addElementContent}; each entry is a tuple C{(content, element_use,
    maybe_element)}:

     - C{content} is text or a binding instance;
     - C{element_use} is C{None} or the
       L{ElementUse<pyxb.binding.content.ElementUse>} instance used to
       create the content;
     - C{maybe_element} is C{False} for text added through
       L{addTextContent} and C{True} for binding instances added through
       L{addElementContent}.

    @return: The accumulated entries (the same object that is appended
    to, not a copy).
    """
    accumulated = self.__content
    return accumulated
134 __content = None
135
142
def addTextContent (self, content):
    """Add the given text as non-element content of the current element.

    The text is recorded as the tuple C{(content, None, False)}: there is
    no element use, and the entry is flagged as not being an element.

    @param content: The text to record.
    @type content: C{unicode} or C{str}
    @return: C{self}
    """
    self.__content.append( (content, None, False) )
    # The documented contract is to return self; previously the method
    # fell off the end and returned None.
    return self
149
def addElementContent (self, element, element_use):
    """Add the given binding instance as element content corresponding
    to the given use.

    The pair is recorded as the tuple C{(element, element_use, True)},
    the final member flagging the entry as element content.

    @param element: Any L{binding
    instance<pyxb.binding.basis._TypeBinding_mixin>}.

    @param element_use: The L{element
    use<pyxb.binding.content.ElementUse>} in the containing complex type.
    """
    entry = (element, element_use, True)
    self.__content.append(entry)
160
162 """A SAX handler class that maintains a stack of enclosing elements and
163 manages namespace declarations.
164
165 This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
166 L{pyxb.binding.saxer.PyXBSAXHandler}.
167 """
168
169
170
171 __locationTemplate = None
172
173
174
175
176 __elementStateConstructor = None
177
178
179
181 """Return the namespace used to resolve unqualified names with no default namespace."""
182 return self.__fallbackNamespace
183 __fallbackNamespace = None
184
185
186
187
188
189
190 __nextNamespaceContext = None
191
192
def namespaceContext (self):
    """Return the namespace context this handler currently holds.

    The value is documented as the context used for QName resolution
    within the current element.

    @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}
    """
    context = self.__namespaceContext
    return context
199 __namespaceContext = None
200
201
202
203
204
205 __includingContext = None
206
207
208
209 __locator = None
210
211
214 __elementState = None
215
216
217 __elementStateStack = []
218
220 """Return the binding object corresponding to the top-most
221 element in the document
222
223 @return: An instance of L{basis._TypeBinding_mixin} (most usually a
224 L{basis.complexTypeDefinition})."""
225 return self.__rootObject
226 __rootObject = None
227
245
247 """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.
248
249 @keyword fallback_namespace: Optional namespace to use for unqualified
250 names with no default namespace in scope. Has no effect unless it is
251 an absent namespace.
252
253 @keyword element_state_constructor: Optional callable object that
254 creates instances of L{SAXElementState} that hold element-specific
255 information. Defaults to L{SAXElementState}.
256
257 @keyword target_namespace: Optional namespace to set as the target
258 namespace. If not provided, there is no target namespace (not even an
259 absent one). This is the appropriate situation when processing plain
260 XML documents.
261
262 @keyword location_base: An object to be recorded as the base of all
263 L{pyxb.utils.utility.Location} instances associated with events and
264 objects handled by the parser.
265 """
266 self.__includingContext = kw.pop('including_context', None)
267 self.__fallbackNamespace = kw.pop('fallback_namespace', None)
268 self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
269 self.__targetNamespace = kw.pop('target_namespace', None)
270 self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
271
272
273
279
281 """Save the locator object."""
282 self.__locator = locator
283
285 """Process the start of a document.
286
287 This resets this handler for a new document.
288 @note: setDocumentLocator is invoked before startDocument
289 """
290 self.reset()
291
298
299
300
301
302
303
304
333
346
347
348
349
350
351
352 __pendingText = None
357
361
365
368
369 import StringIO
371 """Dummy used to prevent the SAX parser from crashing when it sees
372 processing instructions that we don't care about."""
374 return StringIO.StringIO('')
375
376 _CreateParserModules = []
378 """Provide list of modules to be used when creating parsers.
379
380 C{xml.sax.make_parser()} takes as a parameter an optional list of modules
381 which allow customization of the parser to be used. Certain parsers have
382 better support for Unicode than others.
383
384 As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to
385 be used.
386
387 The default behavior if this function is not called, or if it is called
388 with an empty list or C{None}, is to provide no specific modules, which
389 will result in the system default parser (probably expat).
390
391 @param create_parser_modules: an iterable list of names of modules that
392 provide a C{create_parser} function. Pass C{None} to reset to the system
393 default. """
394 global _CreateParserModules
395 if create_parser_modules is None:
396 _CreateParserModules = []
397 else:
398 _CreateParserModules = list(create_parser_modules)
399
401 """Extend C{xml.sax.make_parser} to configure the parser the way we
402 need it:
403
404 - C{feature_namespaces} is set to C{True} so we process xmlns
405 directives properly
406 - C{feature_namespace_prefixes} is set to C{False} so we don't get
407 prefixes encoded into our names (probably redundant with the above but
408 still...)
409
410 All keywords not documented here (and C{fallback_namespace}, which is) are
411 passed to the C{content_handler_constructor} if that must be invoked.
412
413 @keyword content_handler: The content handler instance for the
414 parser to use. If not provided, an instance of C{content_handler_constructor}
415 is created and used.
416 @type content_handler: C{xml.sax.handler.ContentHandler}
417
418 @keyword content_handler_constructor: A callable which produces an
419 appropriate instance of (a subclass of) L{BaseSAXHandler}. The default is
420 L{BaseSAXHandler}.
421
422 @keyword fallback_namespace: The namespace to use for lookups of
423 unqualified names in absent namespaces; see
424 L{pyxb.namespace.ExpandedName}. This keyword is not used by this
425 function, but is passed to the C{content_handler_constructor}.
426 @type fallback_namespace: L{pyxb.namespace.Namespace}
427 """
428 content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler)
429 content_handler = kw.pop('content_handler', None)
430 if content_handler is None:
431 content_handler = content_handler_constructor(**kw)
432 parser = xml.sax.make_parser(_CreateParserModules)
433 parser.setFeature(xml.sax.handler.feature_namespaces, True)
434 parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
435 parser.setContentHandler(content_handler)
436
437 try:
438 parser.setEntityResolver(_EntityResolver())
439 except xml.sax.SAXNotSupportedException:
440 pass
441 return parser
442
443 if '__main__' == __name__:
444 import xml.dom.pulldom
445 import pyxb.utils.domutils as domutils
446 import pyxb.utils.saxdom as saxdom
447 import time
448 import lxml.sax
449 import lxml.etree
450 import StringIO
451 import sys
452
453 Handler = BaseSAXHandler
454 xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
455 if 1 < len(sys.argv):
456 xml_file = sys.argv[1]
457 xmls = open(xml_file).read()
458
459 dt1 = time.time()
460 dt2 = time.time()
461 dom = xml.dom.minidom.parseString(xmls)
462 dt3 = time.time()
463
464 snt1 = time.time()
465 saxer = make_parser(content_handler=_NoopSAXHandler())
466 snt2 = time.time()
467 saxer.parse(StringIO.StringIO(xmls))
468 snt3 = time.time()
469
470 sbt1 = time.time()
471 saxer = make_parser(content_handler=BaseSAXHandler())
472 sbt2 = time.time()
473 saxer.parse(StringIO.StringIO(xmls))
474 sbt3 = time.time()
475
476 pdt1 = time.time()
477 sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
478 h = sdomer.getContentHandler()
479 pdt2 = time.time()
480 sdomer.parse(StringIO.StringIO(xmls))
481 pdt3 = time.time()
482
483 lst1 = time.time()
484 tree = lxml.etree.fromstring(xmls)
485 lst2 = time.time()
486 lsh = Handler()
487 lxml.sax.saxify(tree, lsh)
488 lst3 = time.time()
489
490 ldt1 = time.time()
491 tree = lxml.etree.fromstring(xmls)
492 ldt2 = time.time()
493 ldh = xml.dom.pulldom.SAX2DOM()
494 lxml.sax.saxify(tree, ldh)
495 ldt3 = time.time()
496
497 print 'minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1)
498 print 'SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1)
499 print 'SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1)
500 print 'PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1)
501 print 'LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1)
502 print 'LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1)
503
504
505
506
507