1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 """This module contains support for processing XML using a SAX parser.
17
18 In particular, it provides a L{base content handler class<BaseSAXHandler>}
19 that maintains namespace context and element state in a stack; and a L{base
20 element state class <SAXElementState>} which records the location of the
21 element in the stream. These classes are extended for specific parsing needs
22 (e.g., L{pyxb.binding.saxer}).
23 """
24
25 import xml.sax
26 import xml.sax.handler
27 import pyxb.namespace
28 import StringIO
29 import logging
30
31 _log = logging.getLogger(__name__)
32
34 """A SAX handler class which prints each method invocation.
35 """
36
37
38 __trace = False
39
41 _log.debug('setDocumentLocator %s', locator)
42
44 _log.debug('startDocument')
45
48
51
53 _log.debug('startElementNS %s %s', name, qname)
54
56 _log.debug('endElementNS %s %s', name, qname)
57
60
62 _log.debug('ignorableWhitespace len %d', len(whitespace))
63
65 _log.debug('processingInstruction %s %s', target, data)
66
68 """A SAX handler class which doesn't do anything. Used to get baseline
69 performance parsing a particular document.
70 """
71
74
77
80
83
86
89
92
95
98
99
101 """State corresponding to processing a given element with the SAX
102 model."""
103
105 """Reference to the SAXElementState of the element enclosing this
106 one."""
107 return self.__parentState
108 __parentState = None
109
def namespaceContext (self):
    """Return the namespace context recorded for this element state.

    @return: The value held in the private C{__namespaceContext} attribute,
    documented as the L{pyxb.namespace.resolution.NamespaceContext} used for
    this binding.
    """
    context = self.__namespaceContext
    return context
114 __namespaceContext = None
115
117 """The L{expanded name<pyxb.namespace.ExpandedName>} of the
118 element."""
119 return self.__expandedName
120 __expandedName = None
121
123 """The L{location<pyxb.utils.utility.Location>} corresponding to the
124 element event."""
125 return self.__location
126 __location = None
127
def content (self):
    """Return the content accumulated for this element.

    The accumulation is intended to be supplied to the content model when
    the element end is reached.

    Entries are appended by L{addTextContent} and L{addElementContent} as
    C{(content, element_use, maybe_element)} tuples:

      - C{content} is text or a binding instance;
      - C{element_use} is C{None} for text, otherwise the
        L{ElementUse<pyxb.binding.content.ElementUse>} instance supplied
        with the element;
      - C{maybe_element} is C{False} for entries added as text and C{True}
        for entries added as element content.

    @return: The accumulated content, returned by reference (not copied).
    """
    accumulated = self.__content
    return accumulated
139 __content = None
140
147
def addTextContent (self, content):
    """Add the given text as non-element content of the current element.

    The text is appended to the accumulated content as the tuple
    C{(content, None, False)}: no element use is associated with it, and the
    final member is C{False} (L{addElementContent} records C{True} there).

    @param content: The text to record.
    @type content: C{unicode} or C{str}
    @return: C{self}
    """
    self.__content.append( (content, None, False) )
    # The documented contract is to return the instance itself; previously
    # nothing was returned, so callers received None.
    return self
154
def addElementContent (self, element, element_use):
    """Add the given binding instance as element content corresponding to
    the given use.

    The pair is appended to the accumulated content as the tuple
    C{(element, element_use, True)}.

    @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

    @param element_use: The L{element
    use<pyxb.binding.content.ElementUse>} in the containing complex type.
    """
    entry = (element, element_use, True)
    self.__content.append(entry)
165
167 """A SAX handler class that maintains a stack of enclosing elements and
168 manages namespace declarations.
169
170 This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
171 L{pyxb.binding.saxer.PyXBSAXHandler}.
172 """
173
174
175
176 __locationTemplate = None
177
178
179
180
181 __elementStateConstructor = None
182
183
184
186 """Return the namespace used to resolve unqualified names with no default namespace."""
187 return self.__fallbackNamespace
188 __fallbackNamespace = None
189
190
191
192
193
194
195 __nextNamespaceContext = None
196
197
def namespaceContext (self):
    """Return the namespace context used for QName resolution within the
    current element.

    @return: The value held in the private C{__namespaceContext} attribute,
    documented as an instance of
    L{pyxb.namespace.resolution.NamespaceContext}.
    """
    current_context = self.__namespaceContext
    return current_context
204 __namespaceContext = None
205
206
207
208
209
210 __includingContext = None
211
212
213
214 __locator = None
215
216
219 __elementState = None
220
221
222 __elementStateStack = []
223
225 """Return the binding object corresponding to the top-most
226 element in the document
227
228 @return: An instance of L{basis._TypeBinding_mixin} (most usually a
229 L{basis.complexTypeDefinition}."""
230 return self.__rootObject
231 __rootObject = None
232
250
252 """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.
253
254 @keyword fallback_namespace: Optional namespace to use for unqualified
255 names with no default namespace in scope. Has no effect unless it is
256 an absent namespace.
257
258 @keyword element_state_constructor: Optional callable object that
259 creates instances of L{SAXElementState} that hold element-specific
260 information. Defaults to L{SAXElementState}.
261
262 @keyword target_namespace: Optional namespace to set as the target
263 namespace. If not provided, there is no target namespace (not even an
264 absent one). This is the appropriate situation when processing plain
265 XML documents.
266
267 @keyword location_base: An object to be recorded as the base of all
268 L{pyxb.utils.utility.Location} instances associated with events and
269 objects handled by the parser.
270 """
271 self.__includingContext = kw.pop('including_context', None)
272 self.__fallbackNamespace = kw.pop('fallback_namespace', None)
273 self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
274 self.__targetNamespace = kw.pop('target_namespace', None)
275 self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
276
277
278
284
286 """Save the locator object."""
287 self.__locator = locator
288
290 """Process the start of a document.
291
292 This resets this handler for a new document.
293 @note: setDocumentLocator is invoked before startDocument
294 """
295 self.reset()
296
303
304
305
306
307
308
337
350
351
352
353
354
355
356 __pendingText = None
361
365
369
372
374 """Dummy used to prevent the SAX parser from crashing when it sees
375 processing instructions that we don't care about."""
377 return StringIO.StringIO('')
378
379 _CreateParserModules = []
381 """Provide list of modules to be used when creating parsers.
382
383 C{xml.sax.make_parser()} takes as a parameter an optional list of modules
384 which allow customization of the parser to be used. Certain parsers have
385 better support for Unicode than others.
386
387 As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to
388 be used.
389
390 The default behavior if this function is not called, or if it is called
391 with an empty list or C{None}, is to provide no specific modules, which
392 will result in the system default parser (probably expat).
393
394 @param create_parser_modules: an iterable list of names of modules that
395 provide a C{create_parser} function. Pass C{None} to reset to the system
396 default. """
397 global _CreateParserModules
398 if create_parser_modules is None:
399 _CreateParserModules = []
400 else:
401 _CreateParserModules = list(create_parser_modules)
402
404 """Extend C{xml.sax.make_parser} to configure the parser the way we
405 need it:
406
407 - C{feature_namespaces} is set to C{True} so we process xmlns
408 directives properly
409 - C{feature_namespace_prefixes} is set to C{False} so we don't get
410 prefixes encoded into our names (probably redundant with the above but
411 still...)
412
413 All keywords not documented here (and C{fallback_namespace}, which is) are
414 passed to the C{content_handler_constructor} if that must be invoked.
415
416 @keyword content_handler: The content handler instance for the
417 parser to use. If not provided, an instance of C{content_handler_constructor}
418 is created and used.
419 @type content_handler: C{xml.sax.handler.ContentHandler}
420
421 @keyword content_handler_constructor: A callable which produces an
422 appropriate instance of (a subclass of) L{BaseSAXHandler}. The default is
423 L{BaseSAXHandler}.
424
425 @keyword fallback_namespace: The namespace to use for lookups of
426 unqualified names in absent namespaces; see
427 L{pyxb.namespace.ExpandedName}. This keyword is not used by this
428 function, but is passed to the C{content_handler_constructor}.
429 @type fallback_namespace: L{pyxb.namespace.Namespace}
430 """
431 content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler)
432 content_handler = kw.pop('content_handler', None)
433 if content_handler is None:
434 content_handler = content_handler_constructor(**kw)
435 parser = xml.sax.make_parser(_CreateParserModules)
436 parser.setFeature(xml.sax.handler.feature_namespaces, True)
437 parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
438 parser.setContentHandler(content_handler)
439
440 try:
441 parser.setEntityResolver(_EntityResolver())
442 except xml.sax.SAXNotSupportedException:
443 pass
444 return parser
445
if '__main__' == __name__:
    # Ad-hoc benchmark: parse one XML document with several parser/handler
    # combinations and print the elapsed wall-clock time for each phase.
    # The document path may be given as the first command-line argument.
    #
    # xml.dom.minidom is imported explicitly because parseString() is called
    # on it below; importing only xml.dom.pulldom left that call dependent on
    # some other module having imported minidom already.
    import xml.dom.minidom
    import xml.dom.pulldom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]
    # Read the whole document once; the context manager closes the file
    # handle instead of leaving that to garbage collection.
    with open(xml_file) as xml_stream:
        xmls = xml_stream.read()

    # minidom.
    # NOTE(review): dt1 and dt2 are taken back to back, so the "read" figure
    # reported below does not include reading the file above -- confirm
    # whether that is intended.
    dt1 = time.time()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX parser driving the do-nothing handler.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX parser driving the base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX parser whose handler is built by make_parser from the saxdom
    # handler constructor.  The handler lookup is kept inside the timed
    # "create" interval, as before, although its result is not used.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree replayed as SAX events into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree replayed as SAX events into the pulldom SAX2DOM handler.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    # A single parenthesised argument prints identically under the Python 2
    # print statement and is also a valid Python 3 call.
    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))
504
505
506
507
508