1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 """This module contains support for processing XML using a SAX parser.
17
18 In particular, it provides a L{base content handler class<BaseSAXHandler>}
19 that maintains namespace context and element state in a stack; and a L{base
20 element state class <SAXElementState>} which records the location of the
21 element in the stream. These classes are extended for specific parsing needs
22 (e.g., L{pyxb.binding.saxer}).
23 """
24
25 from __future__ import print_function
26 import xml.sax
27 import xml.sax.handler
28 import pyxb.namespace
29 import io
30 import logging
31
32 _log = logging.getLogger(__name__)
33
35 """A SAX handler class which prints each method invocation.
36 """
37
38
39 __trace = False
40
42 print('setDocumentLocator %s' % (locator,))
43
45 print('startDocument')
46
48 print('startPrefixMapping %s %s' % (prefix, uri))
49
51 print('endPrefixMapping %s' % (prefix,))
52
54 print('startElementNS %s %s' % (name, qname))
55
57 print('endElementNS %s %s' % (name, qname))
58
60 print('characters %s' % (content,))
61
63 print('ignorableWhitespace len %d' % (len(whitespace),))
64
66 print('processingInstruction %s %s' % (target, data))
67
69 """A SAX handler class which doesn't do anything. Used to get baseline
70 performance parsing a particular document.
71 """
72
75
78
81
84
87
90
93
96
99
124
126 """State corresponding to processing a given element with the SAX
127 model."""
128
def contentHandler (self):
    """Return the C{xml.sax.handler.ContentHandler} that is processing the
    document.

    @return: The value held in the private C{__contentHandler} attribute.
    """
    return self.__contentHandler
132 __contentHandler = None
133
135 """Reference to the SAXElementState of the element enclosing this
136 one."""
137 return self.__parentState
138 __parentState = None
139
def namespaceContext (self):
    """Return the L{pyxb.namespace.resolution.NamespaceContext} used for
    this binding.

    @return: The value held in the private C{__namespaceContext} attribute.
    """
    return self.__namespaceContext
144 __namespaceContext = None
145
147 """The L{expanded name<pyxb.namespace.ExpandedName>} of the
148 element."""
149 return self.__expandedName
150 __expandedName = None
151
153 """The L{location<pyxb.utils.utility.Location>} corresponding to the
154 element event."""
155 return self.__location
156 __location = None
157
def content (self):
    """Return the content accumulated for this element, to be supplied to
    the content model when the element end is reached.

    The value is the private C{__content} list.  Each member is a
    C{SAXInformationItem} appended by L{addTextContent} or
    L{addElementContent}.  An item is built from the event location, the
    text or binding instance, a flag that is C{False} for text and C{True}
    for element content, and (for element content only) the
    L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>} used to
    create the content, or C{None}.

    @return: The accumulated list; this is the stored object, not a copy.
    """
    return self.__content
169 __content = None
170
179
def addTextContent (self, location, content):
    """Add the given text as non-element content of the current element.

    The text is wrapped in a C{SAXInformationItem} and appended to the
    private C{__content} list.

    @param location: Passed through as the first argument of
    C{SAXInformationItem}; presumably the location of the text event
    (confirm against the caller).
    @param content: The text to record.
    @type content: C{unicode} or C{str}
    @return: C{None}; the element state is modified in place.
    """
    # The third argument is False for text; addElementContent passes True.
    self.__content.append(SAXInformationItem(location, content, False))
186
def addElementContent (self, location, element, element_decl=None):
    """Add the given binding instance as element content corresponding to
    the given use.

    The instance is wrapped in a C{SAXInformationItem} and appended to the
    private C{__content} list.

    @param location: Passed through as the first argument of
    C{SAXInformationItem}; presumably the location of the element event
    (confirm against the caller).

    @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

    @param element_decl: The L{element
    use<pyxb.binding.content.ElementDeclaration>} in the containing complex
    type, or C{None} (the default) when no declaration is supplied.

    @return: C{None}; the element state is modified in place.
    """
    # The third argument is True for element content; addTextContent passes
    # False.
    self.__content.append(SAXInformationItem(location, element, True, element_decl))
197
199 """A SAX handler class that maintains a stack of enclosing elements and
200 manages namespace declarations.
201
202 This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
203 L{pyxb.binding.saxer.PyXBSAXHandler}.
204 """
205
206
207
208 __locationTemplate = None
209
213
214
215
216
217 __elementStateConstructor = None
218
219
220
222 """Return the namespace used to resolve unqualified names with no default namespace."""
223 return self.__fallbackNamespace
224 __fallbackNamespace = None
225
226
227
228
229
230
231
232
233
234
235
236 __nextNamespaceContext = None
237
238
def namespaceContext (self):
    """Return the namespace context used for QName resolution within the
    current element.

    @return: An instance of L{pyxb.namespace.resolution.NamespaceContext},
    read from the private C{__namespaceContext} attribute."""
    return self.__namespaceContext
245 __namespaceContext = None
246
247
248
249
250
251 __includingContext = None
252
253
254
255 __locator = None
256
257
260 __elementState = None
261
262
263 __elementStateStack = []
264
266 """Return the binding object corresponding to the top-most
267 element in the document
268
269 @return: An instance of L{basis._TypeBinding_mixin} (most usually a
270 L{basis.complexTypeDefinition})."""
271 return self.__rootObject
272 __rootObject = None
273
292
294 """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.
295
296 @keyword fallback_namespace: Optional namespace to use for unqualified
297 names with no default namespace in scope. Has no effect unless it is
298 an absent namespace.
299
300 @keyword element_state_constructor: Optional callable object that
301 creates instances of L{SAXElementState} that hold element-specific
302 information. Defaults to L{SAXElementState}.
303
304 @keyword target_namespace: Optional namespace to set as the target
305 namespace. If not provided, there is no target namespace (not even an
306 absent one). This is the appropriate situation when processing plain
307 XML documents.
308
309 @keyword location_base: An object to be recorded as the base of all
310 L{pyxb.utils.utility.Location} instances associated with events and
311 objects handled by the parser.
312 """
313 self.__includingContext = kw.pop('including_context', None)
314 self.__fallbackNamespace = kw.pop('fallback_namespace', None)
315 self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
316 self.__targetNamespace = kw.pop('target_namespace', None)
317 self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
318
320 """Save the locator object."""
321 self.__locator = locator
322
324 """Process the start of a document.
325
326 This resets this handler for a new document.
327 @note: setDocumentLocator is invoked before startDocument
328 """
329 self.reset()
330
332 ns_ctx = self.__nextNamespaceContext
333 if ns_ctx is None:
334 assert self.__namespaceContext is not None
335 ns_ctx = pyxb.namespace.resolution.NamespaceContext(parent_context=self.__namespaceContext)
336 self.__nextNamespaceContext = ns_ctx
337 return ns_ctx
338
345
346
347
348
349
350
392
406
407
408
409
410
411
412 __pendingText = None
413 __pendingTextLocation = None
422
428
432
435
437 """Dummy used to prevent the SAX parser from crashing when it sees
438 processing instructions that we don't care about."""
440 return io.StringIO(u'')
441
442 _CreateParserModules = []
444 """Provide list of modules to be used when creating parsers.
445
446 C{xml.sax.make_parser()} takes as a parameter an optional list of modules
447 which allow customization of the parser to be used. Certain parsers have
448 better support for Unicode than others.
449
450 As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to
451 be used.
452
453 The default behavior if this function is not called, or if it is called
454 with an empty list or C{None}, is to provide no specific modules, which
455 will result in the system default parser (probably expat).
456
457 @param create_parser_modules: an iterable list of names of modules that
458 provide a C{create_parser} function. Pass C{None} to reset to the system
459 default. """
460 global _CreateParserModules
461 if create_parser_modules is None:
462 _CreateParserModules = []
463 else:
464 _CreateParserModules = list(create_parser_modules)
465
467 """Extend C{xml.sax.make_parser} to configure the parser the way we
468 need it:
469
470 - C{feature_namespaces} is set to C{True} so we process xmlns
471 directives properly
472 - C{feature_namespace_prefixes} is set to C{False} so we don't get
473 prefixes encoded into our names (probably redundant with the above but
474 still...)
475
476 All keywords not documented here (and C{fallback_namespace}, which is) are
477 passed to the C{content_handler_constructor} if that must be invoked.
478
479 @keyword content_handler: The content handler instance for the
480 parser to use. If not provided, an instance of C{content_handler_constructor}
481 is created and used.
482 @type content_handler: C{xml.sax.handler.ContentHandler}
483
484 @keyword content_handler_constructor: A callable which produces an
485 appropriate instance of (a subclass of) L{BaseSAXHandler}. The default is
486 L{BaseSAXHandler}.
487
488 @keyword fallback_namespace: The namespace to use for lookups of
489 unqualified names in absent namespaces; see
490 L{pyxb.namespace.ExpandedName}. This keyword is not used by this
491 function, but is passed to the C{content_handler_constructor}.
492 @type fallback_namespace: L{pyxb.namespace.Namespace}
493 """
494 content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler)
495 content_handler = kw.pop('content_handler', None)
496 if content_handler is None:
497 content_handler = content_handler_constructor(**kw)
498 parser = xml.sax.make_parser(_CreateParserModules)
499 parser.setFeature(xml.sax.handler.feature_namespaces, True)
500 parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
501 parser.setContentHandler(content_handler)
502
503 try:
504 parser.setEntityResolver(_EntityResolver())
505 except xml.sax.SAXNotSupportedException:
506 pass
507 return parser
508
if '__main__' == __name__:
    # Ad-hoc benchmark: parse one XML document with several parser/handler
    # combinations and print the wall-clock time taken by each.
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        # An optional first command-line argument overrides the sample file.
        xml_file = sys.argv[1]
    # Read the document once; the context manager closes the file handle
    # instead of leaving it to the garbage collector.
    with open(xml_file, 'rb') as xml_stream:
        xmld = xml_stream.read()

    # minidom baseline.
    # NOTE(review): dt1 and dt2 bracket no work, so the "read" figure printed
    # below is always close to zero; the file is read before timing starts.
    dt1 = time.time()
    dt2 = time.time()
    dom = xml.dom.minidom.parse(io.BytesIO(xmld))
    dt3 = time.time()

    # SAX parser with a handler that does nothing.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(io.BytesIO(xmld))
    snt3 = time.time()

    # SAX parser with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(io.BytesIO(xmld))
    sbt3 = time.time()

    # SAX parser with the PyXB SAX-to-DOM handler.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(io.BytesIO(xmld))
    pdt3 = time.time()

    # lxml tree replayed as SAX events into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmld)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree replayed as SAX events into the pulldom DOM builder.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmld)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))
568
569
570
571
572