1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 """This module contains support for processing XML using a SAX parser.
16
17 In particular, it provides a L{base content handler class<BaseSAXHandler>}
18 that maintains namespace context and element state in a stack; and a L{base
19 element state class <SAXElementState>} which records the location of the
20 element in the stream. These classes are extended for specific parsing needs
21 (e.g., L{pyxb.binding.saxer}).
22 """
23
24 import xml.sax
25 import xml.sax.handler
26 import pyxb.namespace
27
29 """A SAX handler class which prints each method invocation.
30 """
31
32
33 __trace = False
34
36 print 'setDocumentLocator %s' % (locator,)
37
40
42 print 'startPrefixMapping %s %s' % (prefix, uri)
43
45 print 'endPrefixMapping %s' % (prefix,)
46
48 print 'startElementNS %s %s' % (name, qname)
49
51 print 'endElementNS %s %s' % (name, qname)
52
54 print 'characters %s' % (content,)
55
57 print 'ignorableWhitespace len %d' % (len(whitespace),)
58
60 print 'processingInstruction %s' % (data,)
61
63 """A SAX handler class which doesn't do anything. Used to get baseline
64 performance parsing a particular document.
65 """
66
69
72
75
78
81
84
87
90
93
94
96 """State corresponding to processing a given element with the SAX
97 model."""
98
100 """Reference to the SAXElementState of the element enclosing this
101 one."""
102 return self.__parentState
103 __parentState = None
104
def namespaceContext(self):
    """Accessor for the namespace context recorded on this element state.

    @return: The stored value of the private C{__namespaceContext}
    attribute, i.e. the L{pyxb.namespace.resolution.NamespaceContext} used
    for this binding (C{None} if nothing has been recorded).
    """
    context = self.__namespaceContext
    return context
109 __namespaceContext = None
110
112 """The L{expanded name<pyxb.namespace.ExpandedName>} of the
113 element."""
114 return self.__expandedName
115 __expandedName = None
116
118 """The L{location<pyxb.utils.utility.Location>} corresponding to the
119 element event."""
120 return self.__location
121 __location = None
122
def content(self):
    """Return the content accumulated so far for this element.

    The accumulated content is meant to be supplied to the content model
    once the end of the element is reached.

    Each entry is a tuple C{(content, element_use, maybe_element)}:

      - C{content} is text or a binding instance;
      - C{element_use} is C{None} or the
        L{ElementUse<pyxb.binding.content.ElementUse>} instance used to
        create the content;
      - C{maybe_element} is C{False} for entries recorded through
        L{addTextContent} and C{True} for entries recorded through
        L{addElementContent}.

    @return: The underlying container itself (not a copy).
    """
    accumulated = self.__content
    return accumulated
134 __content = None
135
142
def addTextContent(self, content):
    """Add the given text as non-element content of the current element.

    The text is recorded as the tuple C{(content, None, False)}: there is no
    element use associated with it, and it is flagged as not being element
    content.

    @param content: The text to record.
    @type content: C{unicode} or C{str}
    @return: C{self}
    """
    self.__content.append((content, None, False))
    # The documented contract is to return the state object; previously the
    # method fell off the end and returned None.
    return self
149
def addElementContent(self, element, element_use):
    """Record a binding instance as element content of the current element.

    The instance is stored together with the use it corresponds to, as the
    tuple C{(element, element_use, True)}.

    @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

    @param element_use: The L{element
    use<pyxb.binding.content.ElementUse>} in the containing complex type.
    """
    entry = (element, element_use, True)
    self.__content.append(entry)
160
162 """A SAX handler class that maintains a stack of enclosing elements and
163 manages namespace declarations.
164
165 This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
166 L{pyxb.binding.saxer.PyXBSAXHandler}.
167 """
168
169
170
171 __locationTemplate = None
172
173
174
175
176 __elementStateConstructor = None
177
178
179
180 __fallbackNamespace = None
181
182
183
184
185
186
187 __nextNamespaceContext = None
188
189
def namespaceContext(self):
    """Accessor for the namespace context that applies to QName resolution
    within the current element.

    @return: The stored value of the private C{__namespaceContext}
    attribute, an instance of
    L{pyxb.namespace.resolution.NamespaceContext}."""
    current_context = self.__namespaceContext
    return current_context
196 __namespaceContext = None
197
198
199
200
201
202 __includingContext = None
203
204
205
206 __locator = None
207
208
211 __elementState = None
212
213
214 __elementStateStack = []
215
217 """Return the binding object corresponding to the top-most
218 element in the document
219
220 @return: An instance of L{basis._TypeBinding_mixin} (most usually a
221 L{basis.complexTypeDefinition})."""
222 return self.__rootObject
223 __rootObject = None
224
242
244 """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.
245
246 @keyword fallback_namespace: Optional namespace to use for unqualified
247 names with no default namespace in scope. Has no effect unless it is
248 an absent namespace.
249
250 @keyword element_state_constructor: Optional callable object that
251 creates instances of L{SAXElementState} that hold element-specific
252 information. Defaults to L{SAXElementState}.
253
254 @keyword target_namespace: Optional namespace to set as the target
255 namespace. If not provided, there is no target namespace (not even an
256 absent one). This is the appropriate situation when processing plain
257 XML documents.
258
259 @keyword location_base: An object to be recorded as the base of all
260 L{pyxb.utils.utility.Location} instances associated with events and
261 objects handled by the parser.
262 """
263 self.__includingContext = kw.pop('including_context', None)
264 self.__fallbackNamespace = kw.pop('fallback_namespace', None)
265 self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
266 self.__targetNamespace = kw.pop('target_namespace', None)
267 self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
268
269
270
276
278 """Save the locator object."""
279 self.__locator = locator
280
282 """Process the start of a document.
283
284 This resets this handler for a new document.
285 @note: setDocumentLocator is invoked before startDocument
286 """
287 self.reset()
288
295
296
297
298
299
300
301
330
343
344
345
346
347
348
349 __pendingText = None
354
358
362
365
366 import StringIO
368 """Dummy used to prevent the SAX parser from crashing when it sees
369 processing instructions that we don't care about."""
371 return StringIO.StringIO('')
372
374 """Extend C{xml.sax.make_parser} to configure the parser the way we
375 need it:
376
377 - C{feature_namespaces} is set to C{True} so we process xmlns
378 directives properly
379 - C{feature_namespace_prefixes} is set to C{False} so we don't get
380 prefixes encoded into our names (probably redundant with the above but
381 still...)
382
383 All arguments not documented here are passed to C{xml.sax.make_parser}.
384
385 All keywords not documented here (and C{fallback_namespace}, which is) are
386 passed to the C{content_handler_constructor} if that must be invoked.
387
388 @keyword content_handler: The content handler instance for the
389 parser to use. If not provided, an instance of C{content_handler_constructor}
390 is created and used.
391 @type content_handler: C{xml.sax.handler.ContentHandler}
392
393 @keyword content_handler_constructor: A callable which produces an
394 appropriate instance of (a subclass of) L{BaseSAXHandler}. The default is
395 L{BaseSAXHandler}.
396
397 @keyword fallback_namespace: The namespace to use for lookups of
398 unqualified names in absent namespaces; see
399 L{pyxb.namespace.ExpandedName}. This keyword is not used by this
400 function, but is passed to the C{content_handler_constructor}.
401 @type fallback_namespace: L{pyxb.namespace.Namespace}
402 """
403 content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler)
404 content_handler = kw.pop('content_handler', None)
405 if content_handler is None:
406 content_handler = content_handler_constructor(**kw)
407 parser = xml.sax.make_parser(*args)
408 parser.setFeature(xml.sax.handler.feature_namespaces, True)
409 parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
410 parser.setContentHandler(content_handler)
411 parser.setEntityResolver(_EntityResolver())
412 return parser
413
if '__main__' == __name__:
    # Ad-hoc benchmark: parse one XML document with several parser/handler
    # combinations and print, for each, the set-up time, the parse time and
    # the total.  The document path may be given as the first command-line
    # argument; otherwise the hard-coded sample path below is used.

    # minidom is used directly below, so import it explicitly rather than
    # relying on some other module having imported it as a side effect.
    import xml.dom.minidom
    import xml.dom.pulldom
    import pyxb.utils.domutils as domutils
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import StringIO
    import sys

    Handler = BaseSAXHandler
    xml_file = '/home/pab/pyxb/dev/examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # minidom: the first interval covers reading the file (it is reported
    # as "read" below), the second covers building the DOM.
    dt1 = time.time()
    xml_stream = open(xml_file)
    try:
        xmls = xml_stream.read()
    finally:
        # Close the input deterministically instead of leaking the handle.
        xml_stream.close()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX with a handler that does nothing: baseline cost of the parser.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX with the namespace/element-state tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX with the PyXB handler constructed by make_parser itself.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree replayed as SAX events into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree replayed as SAX events into the stdlib SAX2DOM builder.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    # A single parenthesised expression prints identically under the
    # Python 2 print statement.
    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))
474
475
476
477
478