1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 """This module contains support for processing XML using a SAX parser.
17
18 In particular, it provides a L{base content handler class<BaseSAXHandler>}
19 that maintains namespace context and element state in a stack; and a L{base
20 element state class <SAXElementState>} which records the location of the
21 element in the stream. These classes are extended for specific parsing needs
22 (e.g., L{pyxb.binding.saxer}).
23 """
24
25 import xml.sax
26 import xml.sax.handler
27 import pyxb.namespace
28 import StringIO
29 import logging
30
31 _log = logging.getLogger(__name__)
32
34 """A SAX handler class which prints each method invocation.
35 """
36
37
38 __trace = False
39
41 _log.debug('setDocumentLocator %s', locator)
42
44 _log.debug('startDocument')
45
48
51
53 _log.debug('startElementNS %s %s', name, qname)
54
56 _log.debug('endElementNS %s %s', name, qname)
57
60
62 _log.debug('ignorableWhitespace len %d', len(whitespace))
63
65 _log.debug('processingInstruction %s %s', target, data)
66
68 """A SAX handler class which doesn't do anything. Used to get baseline
69 performance parsing a particular document.
70 """
71
74
77
80
83
86
89
92
95
98
123
125 """State corresponding to processing a given element with the SAX
126 model."""
127
128 - def contentHandler (self):
129 """Reference to the C{xml.sax.handler.ContentHandler} that is processing the document."""
130 return self.__contentHandler
131 __contentHandler = None
132
134 """Reference to the SAXElementState of the element enclosing this
135 one."""
136 return self.__parentState
137 __parentState = None
138
139 - def namespaceContext (self):
140 """Return the L{pyxb.namespace.resolution.NamespaceContext} recorded on
141 this state object and used for this binding."""
142 return self.__namespaceContext
143 __namespaceContext = None
144
146 """The L{expanded name<pyxb.namespace.ExpandedName>} of the
147 element."""
148 return self.__expandedName
149 __expandedName = None
150
152 """The L{location<pyxb.utils.utility.Location>} corresponding to the
153 element event."""
154 return self.__location
155 __location = None
156
157 - def content (self):
158 """An accumulation of content to be supplied to the content model when
159 the element end is reached.
160
161 This is a list whose members are C{SAXInformationItem} instances
162 appended by L{addTextContent} and L{addElementContent}. Each is built
163 from a location, the content (text or a binding instance), a flag that
164 is C{False} for text and C{True} for element content, and, for elements,
165 the L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>}
166 (or C{None}) used to create the content."""
167 return self.__content
168 __content = None
169
178
179 - def addTextContent (self, location, content):
180 """Add the given text as non-element content of the current element.
181 @type content: C{unicode} or C{str}
182 @return: C{None}; the new item is appended to the list returned by L{content}
183 """
184 self.__content.append(SAXInformationItem(location, content, False))
185
186 - def addElementContent (self, location, element, element_decl=None):
187 """Add the given binding instance as element content corresponding to
188 the given use.
189
190 @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.
191 @param location: Passed unchanged to C{SAXInformationItem}; presumably where the element occurred (TODO confirm).
192 @param element_decl: The L{element
193 use<pyxb.binding.content.ElementDeclaration>} in the containing complex type, or C{None} (the default).
194 """
195 self.__content.append(SAXInformationItem(location, element, True, element_decl))
196
198 """A SAX handler class that maintains a stack of enclosing elements and
199 manages namespace declarations.
200
201 This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
202 L{pyxb.binding.saxer.PyXBSAXHandler}.
203 """
204
205
206
207 __locationTemplate = None
208
212
213
214
215
216 __elementStateConstructor = None
217
218
219
221 """Return the namespace used to resolve unqualified names with no default namespace."""
222 return self.__fallbackNamespace
223 __fallbackNamespace = None
224
225
226
227
228
229
230 __nextNamespaceContext = None
231
232
233 - def namespaceContext (self):
234 """Return the namespace context used for QName resolution within the
235 current element.
236
237 @return: An instance of L{pyxb.namespace.resolution.NamespaceContext} (the attribute defaults to C{None})."""
238 return self.__namespaceContext
239 __namespaceContext = None
240
241
242
243
244
245 __includingContext = None
246
247
248
249 __locator = None
250
251
254 __elementState = None
255
256
257 __elementStateStack = []
258
260 """Return the binding object corresponding to the top-most
261 element in the document
262
263 @return: An instance of L{basis._TypeBinding_mixin} (most usually a
264 L{basis.complexTypeDefinition})."""
265 return self.__rootObject
266 __rootObject = None
267
286
288 """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.
289
290 @keyword fallback_namespace: Optional namespace to use for unqualified
291 names with no default namespace in scope. Has no effect unless it is
292 an absent namespace.
293
294 @keyword element_state_constructor: Optional callable object that
295 creates instances of L{SAXElementState} that hold element-specific
296 information. Defaults to L{SAXElementState}.
297
298 @keyword target_namespace: Optional namespace to set as the target
299 namespace. If not provided, there is no target namespace (not even an
300 absent one). This is the appropriate situation when processing plain
301 XML documents.
302
303 @keyword location_base: An object to be recorded as the base of all
304 L{pyxb.utils.utility.Location} instances associated with events and
305 objects handled by the parser.
306 """
307 self.__includingContext = kw.pop('including_context', None)
308 self.__fallbackNamespace = kw.pop('fallback_namespace', None)
309 self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
310 self.__targetNamespace = kw.pop('target_namespace', None)
311 self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
312
313
314
320
322 """Save the locator object."""
323 self.__locator = locator
324
326 """Process the start of a document.
327
328 This resets this handler for a new document.
329 @note: setDocumentLocator is invoked before startDocument
330 """
331 self.reset()
332
339
340
341
342
343
344
373
386
387
388
389
390
391
392 __pendingText = None
393 __pendingTextLocation = None
402
408
412
415
417 """Dummy used to prevent the SAX parser from crashing when it sees
418 processing instructions that we don't care about."""
420 return StringIO.StringIO('')
421
422 _CreateParserModules = []
424 """Provide list of modules to be used when creating parsers.
425
426 C{xml.sax.make_parser()} takes as a parameter an optional list of modules
427 which allow customization of the parser to be used. Certain parsers have
428 better support for Unicode than others.
429
430 As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to
431 be used.
432
433 The default behavior if this function is not called, or if it is called
434 with an empty list or C{None}, is to provide no specific modules, which
435 will result in the system default parser (probably expat).
436
437 @param create_parser_modules: an iterable list of names of modules that
438 provide a C{create_parser} function. Pass C{None} to reset to the system
439 default. """
440 global _CreateParserModules
441 if create_parser_modules is None:
442 _CreateParserModules = []
443 else:
444 _CreateParserModules = list(create_parser_modules)
445
447 """Extend C{xml.sax.make_parser} to configure the parser the way we
448 need it:
449
450 - C{feature_namespaces} is set to C{True} so we process xmlns
451 directives properly
452 - C{feature_namespace_prefixes} is set to C{False} so we don't get
453 prefixes encoded into our names (probably redundant with the above but
454 still...)
455
456 All keywords not documented here (and C{fallback_namespace}, which is) are
457 passed to the C{content_handler_constructor} if that must be invoked.
458
459 @keyword content_handler: The content handler instance for the
460 parser to use. If not provided, an instance of C{content_handler_constructor}
461 is created and used.
462 @type content_handler: C{xml.sax.handler.ContentHandler}
463
464 @keyword content_handler_constructor: A callable which produces an
465 appropriate instance of (a subclass of) L{BaseSAXHandler}. The default is
466 L{BaseSAXHandler}.
467
468 @keyword fallback_namespace: The namespace to use for lookups of
469 unqualified names in absent namespaces; see
470 L{pyxb.namespace.ExpandedName}. This keyword is not used by this
471 function, but is passed to the C{content_handler_constructor}.
472 @type fallback_namespace: L{pyxb.namespace.Namespace}
473 """
474 content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler)
475 content_handler = kw.pop('content_handler', None)
476 if content_handler is None:
477 content_handler = content_handler_constructor(**kw)
478 parser = xml.sax.make_parser(_CreateParserModules)
479 parser.setFeature(xml.sax.handler.feature_namespaces, True)
480 parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
481 parser.setContentHandler(content_handler)
482
483 try:
484 parser.setEntityResolver(_EntityResolver())
485 except xml.sax.SAXNotSupportedException:
486 pass
487 return parser
488
if '__main__' == __name__:
    # Ad-hoc timing comparison of several ways to process one XML document:
    # minidom, SAX with the no-op handler, SAX with the namespace-tracking
    # base handler, the PyXB SAX-driven DOM builder, and lxml feeding SAX
    # events to handlers.  Each section records three timestamps so that
    # setup time and parse time can be reported separately.
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler

    # The first command-line argument, if any, overrides the sample document.
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # Read the whole document once; the context manager closes the file
    # instead of leaving the handle to be reclaimed by the garbage collector.
    with open(xml_file) as xml_stream:
        xmls = xml_stream.read()

    # minidom.
    # NOTE(review): dt1 and dt2 are taken back-to-back, so the "read" figure
    # printed below is effectively zero -- confirm whether the file read was
    # meant to sit between them.
    dt1 = time.time()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX with a handler that does nothing (baseline parser cost).
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX building a PyXB saxdom tree; the handler is created by make_parser.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree construction, then replaying it as SAX events into Handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree construction, then replaying it into a pulldom DOM builder.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    # Each print takes a single parenthesised expression, which emits the
    # same text under the Python 2 print statement and the print function.
    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))
548
549
550
551
552