Package pyxb :: Package utils :: Module saxutils
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.saxutils

  1  # Copyright 2009, Peter A. Bigot 
  2  # 
  3  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  4  # not use this file except in compliance with the License. You may obtain a 
  5  # copy of the License at: 
  6  # 
  7  #            http://www.apache.org/licenses/LICENSE-2.0 
  8  # 
  9  # Unless required by applicable law or agreed to in writing, software 
 10  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 11  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 12  # License for the specific language governing permissions and limitations 
 13  # under the License. 
 14   
 15  """This module contains support for processing XML using a SAX parser. 
 16   
 17  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 18  that maintains namespace context and element state in a stack; and a L{base 
 19  element state class <SAXElementState>} which records the location of the 
 20  element in the stream.  These classes are extended for specific parsing needs 
 21  (e.g., L{pyxb.binding.saxer}). 
 22  """ 
 23   
 24  import xml.sax 
 25  import xml.sax.handler 
 26  import pyxb.namespace 
 27   
class TracingSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX handler class which prints each method invocation.

    Every callback writes a single line to standard output consisting of the
    callback name followed by (some of) its arguments.
    """

    # Whether invocation of handler methods should be traced.  No method in
    # this class consults it; it is retained so the class attributes are
    # unchanged.
    __trace = False

    def setDocumentLocator (self, locator):
        """Trace receipt of the document locator."""
        print('setDocumentLocator %s' % (locator,))

    def startDocument (self):
        """Trace the start of a document."""
        print('startDocument')

    def startPrefixMapping (self, prefix, uri):
        """Trace the start of a prefix-to-namespace mapping."""
        print('startPrefixMapping %s %s' % (prefix, uri))

    def endPrefixMapping (self, prefix):
        """Trace the end of a prefix-to-namespace mapping."""
        print('endPrefixMapping %s' % (prefix,))

    def startElementNS (self, name, qname, attrs):
        """Trace the start of an element.  The attributes are not printed."""
        print('startElementNS %s %s' % (name, qname))

    def endElementNS (self, name, qname):
        """Trace the end of an element."""
        print('endElementNS %s %s' % (name, qname))

    def characters (self, content):
        """Trace character data."""
        print('characters %s' % (content,))

    def ignorableWhitespace (self, whitespace):
        """Trace ignorable whitespace; only its length is printed."""
        print('ignorableWhitespace len %d' % (len(whitespace),))

    def processingInstruction (self, target, data):
        """Trace a processing instruction.

        The SAX C{ContentHandler} interface invokes this callback with both
        the target and the data, so both must be accepted; a one-argument
        signature raises C{TypeError} as soon as a document contains a
        processing instruction.

        @param target: The processing instruction target.
        @param data: The processing instruction data.
        """
        print('processingInstruction %s %s' % (target, data))
61
class _NoopSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX handler class which doesn't do anything.  Used to get baseline
    performance parsing a particular document.

    Every callback is overridden with an empty body.
    """

    def setDocumentLocator (self, locator):
        """Ignore the document locator."""
        pass

    def startDocument (self):
        """Ignore the start of the document."""
        pass

    def startPrefixMapping (self, prefix, uri):
        """Ignore the start of a prefix mapping."""
        pass

    def endPrefixMapping (self, prefix):
        """Ignore the end of a prefix mapping."""
        pass

    def startElementNS (self, name, qname, attrs):
        """Ignore the start of an element."""
        pass

    def endElementNS (self, name, qname):
        """Ignore the end of an element."""
        pass

    def characters (self, content):
        """Ignore character data."""
        pass

    def ignorableWhitespace (self, whitespace):
        """Ignore ignorable whitespace."""
        pass

    def processingInstruction (self, target, data):
        """Ignore a processing instruction.

        The signature matches the SAX C{ContentHandler} callback, which
        supplies both C{target} and C{data}; a one-argument signature raises
        C{TypeError} when the parser reports a processing instruction.
        """
        pass
93 94
class SAXElementState (object):
    """Per-element bookkeeping used while an element is being processed
    through the SAX model."""

    # Class-level defaults; __init__ rebinds every one of them on the
    # instance.
    __parentState = None
    __namespaceContext = None
    __expandedName = None
    __location = None
    __content = None

    def __init__ (self, **kw):
        """Create the state for one element.

        @keyword namespace_context: Required; the namespace context recorded
        for the element.  C{KeyError} is raised if it is missing.
        @keyword expanded_name: Optional; recorded as the element name.
        @keyword parent_state: Optional; the state of the enclosing element.
        @keyword location: Optional; the location recorded for the element.
        """
        self.__content = []
        self.__location = kw.get('location')
        self.__parentState = kw.get('parent_state')
        self.__namespaceContext = kw['namespace_context']
        self.__expandedName = kw.get('expanded_name')

    def parentState (self):
        """The SAXElementState of the element that encloses this one, or
        C{None} if none was supplied."""
        return self.__parentState

    def namespaceContext (self):
        """The L{pyxb.namespace.resolution.NamespaceContext} supplied for
        this element."""
        return self.__namespaceContext

    def expandedName (self):
        """The L{expanded name<pyxb.namespace.ExpandedName>} supplied for
        this element."""
        return self.__expandedName

    def location (self):
        """The L{location<pyxb.utils.utility.Location>} supplied for the
        element event."""
        return self.__location

    def content (self):
        """The content accumulated so far, for delivery to the content model
        when the end of the element is reached.

        The result is the list itself (not a copy).  Each member is a tuple
        C{(content, element_use, maybe_element)}: entries added through
        L{addTextContent} are C{(text, None, False)}, and entries added
        through L{addElementContent} are C{(element, element_use, True)}.
        """
        return self.__content

    def addTextContent (self, content):
        """Record text as non-element content of this element.

        @param content: The text to record.
        @type content: C{unicode} or C{str}
        """
        text_entry = (content, None, False)
        self.__content.append(text_entry)

    def addElementContent (self, element, element_use):
        """Record a binding instance as element content under the given use.

        @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

        @param element_use: The L{element
        use<pyxb.binding.content.ElementUse>} in the containing complex type.
        """
        element_entry = (element, element_use, True)
        self.__content.append(element_entry)
160
class BaseSAXHandler (xml.sax.handler.ContentHandler, object):
    """A SAX handler class that maintains a stack of enclosing elements and
    manages namespace declarations.

    This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
    L{pyxb.binding.saxer.PyXBSAXHandler}.
    """

    # An instance of L{pyxb.utils.utility.Location} that will be used to
    # construct the locations of events as they are received.
    __locationTemplate = None

    # The callable that creates an instance of (a subclass of)
    # L{SAXElementState} as required to hold element-specific information as
    # parsing proceeds.
    __elementStateConstructor = None

    # The namespace to use when processing a document with an absent default
    # namespace.
    __fallbackNamespace = None

    # The namespace context that will be in effect at the start of the
    # next element.  One of these is allocated at the start of each
    # element; it moves to become the current namespace upon receipt
    # of either the next element start or a namespace directive that
    # will apply at that element start.
    __nextNamespaceContext = None

    # The namespace context that is in effect for this element.
    def namespaceContext (self):
        """Return the namespace context used for QName resolution within the
        current element.

        @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}"""
        return self.__namespaceContext
    __namespaceContext = None

    # The namespace context in a schema that is including the schema to be
    # parsed by this handler.  This is necessary to handle section 4.2.1 when
    # a schema with a non-absent target namespace includes a schema with no
    # target namespace.
    __includingContext = None

    # A SAX locator object.  @todo: Figure out how to associate the
    # location information with the binding objects.
    __locator = None

    # The state for the element currently being processed
    def elementState (self):
        """Return the element state object for the element currently being
        processed."""
        return self.__elementState
    __elementState = None

    # The states for all enclosing elements.  This class-level list is only
    # a placeholder: L{reset} binds a fresh list on the instance.
    __elementStateStack = []

    def rootObject (self):
        """Return the binding object corresponding to the top-most
        element in the document

        @return: An instance of L{basis._TypeBinding_mixin} (most usually a
        L{basis.complexTypeDefinition}."""
        return self.__rootObject
    __rootObject = None

    def reset (self):
        """Reset the state of the handler in preparation for processing a new
        document.

        @return: C{self}
        """
        self.__namespaceContext = pyxb.namespace.resolution.NamespaceContext(default_namespace=self.__fallbackNamespace,
                                                                             target_namespace=self.__targetNamespace,
                                                                             including_context=self.__includingContext,
                                                                             finalize_target_namespace=False)
        self.__nextNamespaceContext = None
        self.__elementState = self.__elementStateConstructor(namespace_context=self.__namespaceContext)
        self.__elementStateStack = []
        self.__rootObject = None
        # Start every document with an empty text buffer, so text left over
        # from a previous (possibly aborted) parse is never attributed to the
        # new document, and so text callbacks always find a list to append to.
        self.__pendingText = []
        # Note: setDocumentLocator is invoked before startDocument (which
        # calls this), so this method should not reset it.
        return self

    def __init__ (self, **kw):
        """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.

        Keywords other than those documented here are ignored.

        @keyword fallback_namespace: Optional namespace to use for unqualified
        names with no default namespace in scope.  Has no effect unless it is
        an absent namespace.

        @keyword element_state_constructor: Optional callable object that
        creates instances of L{SAXElementState} that hold element-specific
        information.  Defaults to L{SAXElementState}.

        @keyword target_namespace: Optional namespace to set as the target
        namespace.  If not provided, there is no target namespace (not even an
        absent one).  This is the appropriate situation when processing plain
        XML documents.

        @keyword location_base: An object to be recorded as the base of all
        L{pyxb.utils.utility.Location} instances associated with events and
        objects handled by the parser.

        @keyword including_context: Optional namespace context of a schema
        that includes the document being parsed; it is handed to the
        namespace contexts this handler creates and finalizes.
        """
        self.__includingContext = kw.pop('including_context', None)
        self.__fallbackNamespace = kw.pop('fallback_namespace', None)
        self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
        self.__targetNamespace = kw.pop('target_namespace', None)
        self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))

    # If there's a new namespace waiting to be used, make it the
    # current namespace.  Return the current namespace.
    def __updateNamespaceContext (self):
        if self.__nextNamespaceContext is not None:
            self.__namespaceContext = self.__nextNamespaceContext
            self.__nextNamespaceContext = None
        return self.__namespaceContext

    def setDocumentLocator (self, locator):
        """Save the locator object."""
        self.__locator = locator

    def startDocument (self):
        """Process the start of a document.

        This resets this handler for a new document.
        @note: setDocumentLocator is invoked before startDocument
        """
        self.reset()

    def startPrefixMapping (self, prefix, uri):
        """Implement base class method.

        @note: For this to be invoked, the C{feature_namespaces} feature must
        be enabled in the SAX parser."""
        self.__updateNamespaceContext().processXMLNS(prefix, uri)

    # The NamespaceContext management does not require any action upon
    # leaving the scope of a namespace directive, so endPrefixMapping is
    # not overridden.

    def startElementNS (self, name, qname, attrs):
        """Process the start of an element.

        @return: The tuple C{(this_state, parent_state, ns_ctx,
        expanded_name)}: the newly created state for this element, the state
        of the enclosing element, the namespace context in effect for this
        element, and the element's expanded name.
        """
        self.__flushPendingText()

        # Get the context to be used for this element, and create a
        # new context for the next contained element to be found.
        ns_ctx = self.__updateNamespaceContext()

        # Get the element name, which is already a tuple with the namespace assigned.
        expanded_name = pyxb.namespace.ExpandedName(name, fallback_namespace=self.__fallbackNamespace)

        tns_attr = pyxb.namespace.resolution.NamespaceContext._TargetNamespaceAttribute(expanded_name)
        if tns_attr is not None:
            # Not true for wsdl
            #assert ns_ctx.targetNamespace() is None
            ns_ctx.finalizeTargetNamespace(attrs.get(tns_attr.uriTuple()), including_context=self.__includingContext)
            assert ns_ctx.targetNamespace() is not None
        self.__nextNamespaceContext = pyxb.namespace.resolution.NamespaceContext(parent_context=ns_ctx)

        # Save the state of the enclosing element, and create a new
        # state for this element.
        parent_state = self.__elementState
        self.__elementStateStack.append(self.__elementState)
        self.__elementState = this_state = self.__elementStateConstructor(expanded_name=expanded_name,
                                                                          namespace_context=ns_ctx,
                                                                          parent_state=parent_state,
                                                                          location=self.__locationTemplate.newLocation(self.__locator))
        return (this_state, parent_state, ns_ctx, expanded_name)

    def endElementNS (self, name, qname):
        """Process the completion of an element.

        @return: The state of the element that just ended.
        """
        self.__flushPendingText()

        # Save the state of this element, and restore the state for
        # the parent to which we are returning.
        this_state = self.__elementState
        parent_state = self.__elementState = self.__elementStateStack.pop()
        self.__nextNamespaceContext = None
        self.__namespaceContext = parent_state.namespaceContext()

        return this_state

    # We accumulate consecutive text events into a single event, primarily to
    # avoid the confusion that results when the value of a simple type is
    # represented by multiple events, as with "B &amp; W".  Also, it's faster
    # to join them all at once, and to process one content value rather than a
    # sequence of them.
    __pendingText = None
    def __flushPendingText (self):
        # Deliver any buffered text to the current element state as a single
        # string, then start a fresh buffer.
        if self.__pendingText:
            self.__elementState.addTextContent(''.join(self.__pendingText))
        self.__pendingText = []

    def characters (self, content):
        """Save the text as content"""
        self.__pendingText.append(content)

    def ignorableWhitespace (self, whitespace):
        """Save whitespace as content too."""
        # The parameter is named C{whitespace}; appending any other name
        # raises NameError the first time a parser reports ignorable
        # whitespace.
        self.__pendingText.append(whitespace)

    def processingInstruction (self, target, data):
        """Flush any buffered text when a processing instruction is seen.

        The instruction itself is otherwise ignored."""
        self.__flushPendingText()
365 366 import StringIO
class _EntityResolver (object):
    """Dummy resolver installed on the parser so that it does not crash on
    constructs we don't care about.

    Whatever is requested, the answer is an empty in-memory stream."""

    def resolveEntity (self, public_id, system_id):
        """Return a new, empty C{StringIO} stream; both identifiers are
        ignored."""
        empty_stream = StringIO.StringIO('')
        return empty_stream
372
def make_parser (*args, **kw):
    """Wrap C{xml.sax.make_parser}, returning a parser configured the way
    this package needs it:

      - C{feature_namespaces} is enabled, so xmlns directives are processed
        properly
      - C{feature_namespace_prefixes} is disabled, so prefixes are not
        encoded into names (probably redundant with the above but still...)
      - the content handler described below is installed
      - an instance of L{_EntityResolver} is installed as entity resolver

    Positional arguments are passed to C{xml.sax.make_parser}.

    Keywords not documented here (and C{fallback_namespace}, which is) are
    passed to the C{content_handler_constructor} if that must be invoked.

    @keyword content_handler: The content handler instance for the parser
    to use.  If absent or C{None}, one is created by invoking
    C{content_handler_constructor}.
    @type content_handler: C{xml.sax.handler.ContentHandler}

    @keyword content_handler_constructor: A callable which produces an
    appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default
    is L{BaseSAXHandler}.

    @keyword fallback_namespace: The namespace to use for lookups of
    unqualified names in absent namespaces; see
    L{pyxb.namespace.ExpandedName}.  This function does not use it, but
    passes it to the C{content_handler_constructor}.
    @type fallback_namespace: L{pyxb.namespace.Namespace}

    @return: The configured parser.
    """
    handler_factory = kw.pop('content_handler_constructor', BaseSAXHandler)
    handler = kw.pop('content_handler', None)
    if handler is None:
        # Whatever keywords remain are meant for the handler constructor.
        handler = handler_factory(**kw)

    sax_parser = xml.sax.make_parser(*args)
    feature_settings = (
        (xml.sax.handler.feature_namespaces, True),
        (xml.sax.handler.feature_namespace_prefixes, False),
        )
    for (feature, enabled) in feature_settings:
        sax_parser.setFeature(feature, enabled)
    sax_parser.setContentHandler(handler)
    sax_parser.setEntityResolver(_EntityResolver())
    return sax_parser
# Ad-hoc benchmark: parse one document with several parser/handler
# combinations and print the time each one needs.
if '__main__' == __name__:
    # minidom is used directly below, so import it explicitly rather than
    # relying on some other module having imported it.
    import xml.dom.minidom
    import xml.dom.pulldom
    import pyxb.utils.domutils as domutils
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import StringIO
    import sys

    Handler = BaseSAXHandler
    # Default sample document; override by passing a path as the first
    # command-line argument.
    xml_file = '/home/pab/pyxb/dev/examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # minidom: the read is timed between dt1 and dt2 so the "read" figure
    # printed below measures the file read.  The file is closed even if the
    # read fails.
    dt1 = time.time()
    xml_stream = open(xml_file)
    try:
        xmls = xml_stream.read()
    finally:
        xml_stream.close()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX parser with a handler that does nothing (baseline).
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX parser with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX parser with the PyXB SAX-to-DOM handler.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree construction, then SAX events replayed into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree construction, then SAX events replayed into pulldom's SAX2DOM.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))

## Local Variables:
## fill-column:78
## End: