Package pyxb :: Package utils :: Module saxutils
[hide private]
[frames] | no frames]

Source Code for Module pyxb.utils.saxutils

  1  # Copyright 2009, Peter A. Bigot 
  2  # 
  3  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  4  # not use this file except in compliance with the License. You may obtain a 
  5  # copy of the License at: 
  6  # 
  7  #            http://www.apache.org/licenses/LICENSE-2.0 
  8  # 
  9  # Unless required by applicable law or agreed to in writing, software 
 10  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 11  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 12  # License for the specific language governing permissions and limitations 
 13  # under the License. 
 14   
 15  """This module contains support for processing XML using a SAX parser. 
 16   
 17  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 18  that maintains namespace context and element state in a stack; and a L{base 
 19  element state class <SAXElementState>} which records the location of the 
 20  element in the stream.  These classes are extended for specific parsing needs 
 21  (e.g., L{pyxb.binding.saxer}). 
 22  """ 
 23   
 24  import xml.sax 
 25  import xml.sax.handler 
 26  import pyxb.namespace 
 27   
class TracingSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX handler class which prints each method invocation.

    Each content-handler callback writes a single line to standard output
    naming the event and (most of) its arguments.  The C{print} calls use a
    single parenthesized argument so they behave identically whether
    C{print} is a statement or a function.
    """

    # Whether invocation of handler methods should be traced
    # (declared here, but not consulted by any method of this class).
    __trace = False

    def setDocumentLocator (self, locator):
        """Report the locator object supplied by the parser."""
        print('setDocumentLocator %s' % (locator,))

    def startDocument (self):
        """Report the start of a document."""
        print('startDocument')

    def startPrefixMapping (self, prefix, uri):
        """Report a namespace prefix coming into scope."""
        print('startPrefixMapping %s %s' % (prefix, uri))

    def endPrefixMapping (self, prefix):
        """Report a namespace prefix going out of scope."""
        print('endPrefixMapping %s' % (prefix,))

    def startElementNS (self, name, qname, attrs):
        """Report the start of an element; C{attrs} is not printed."""
        print('startElementNS %s %s' % (name, qname))

    def endElementNS (self, name, qname):
        """Report the end of an element."""
        print('endElementNS %s %s' % (name, qname))

    def characters (self, content):
        """Report character data."""
        print('characters %s' % (content,))

    def ignorableWhitespace (self, whitespace):
        """Report only the length of a run of ignorable whitespace."""
        print('ignorableWhitespace len %d' % (len(whitespace),))

    def processingInstruction (self, target, data=None):
        """Report a processing instruction.

        The SAX content handler callback is invoked as
        C{processingInstruction(target, data)}; the previous one-argument
        signature made that call raise C{TypeError}.

        @param target: The processing instruction target.
        @param data: The processing instruction data.  It is optional only
        so that code written against the old one-argument signature keeps
        working; in that case the single value is printed by itself.
        """
        if data is None:
            # Legacy single-argument invocation: keep the original output.
            print('processingInstruction %s' % (target,))
        else:
            print('processingInstruction %s %s' % (target, data))
61
class _NoopSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX handler class which doesn't do anything.  Used to get baseline
    performance parsing a particular document.

    Every callback is overridden with an empty body so that the only cost
    measured is the parser invoking the handler.
    """

    def setDocumentLocator (self, locator):
        """Ignore the locator."""
        pass

    def startDocument (self):
        """Ignore the start of the document."""
        pass

    def startPrefixMapping (self, prefix, uri):
        """Ignore a namespace prefix coming into scope."""
        pass

    def endPrefixMapping (self, prefix):
        """Ignore a namespace prefix going out of scope."""
        pass

    def startElementNS (self, name, qname, attrs):
        """Ignore the start of an element."""
        pass

    def endElementNS (self, name, qname):
        """Ignore the end of an element."""
        pass

    def characters (self, content):
        """Ignore character data."""
        pass

    def ignorableWhitespace (self, whitespace):
        """Ignore ignorable whitespace."""
        pass

    def processingInstruction (self, target, data=None):
        """Ignore a processing instruction.

        The parser invokes this callback as
        C{processingInstruction(target, data)}; the previous one-argument
        signature made that call raise C{TypeError}.  C{data} is optional
        only so the old one-argument form is still accepted.
        """
        pass
93 94
class SAXElementState (object):
    """State corresponding to processing a given element with the SAX
    model."""

    # Class-level defaults for the private fields; __init__ assigns every
    # one of them on the instance.
    __parentState = None
    __namespaceContext = None
    __expandedName = None
    __location = None
    __content = None

    def __init__ (self, **kw):
        """Create the state for one element.

        @keyword expanded_name: The L{expanded
        name<pyxb.namespace.ExpandedName>} of the element.  Optional;
        defaults to C{None}.

        @keyword namespace_context: The namespace context for the element.
        Required: a C{KeyError} is raised if it is not supplied.

        @keyword parent_state: The L{SAXElementState} of the enclosing
        element.  Optional; defaults to C{None}.

        @keyword location: The location associated with the element event.
        Optional; defaults to C{None}.
        """
        self.__expandedName = kw.get('expanded_name', None)
        self.__namespaceContext = kw['namespace_context']
        self.__parentState = kw.get('parent_state', None)
        self.__location = kw.get('location', None)
        # Each instance gets its own content list.
        self.__content = []

    def parentState (self):
        """Reference to the SAXElementState of the element enclosing this
        one."""
        return self.__parentState

    def namespaceContext (self):
        """The L{pyxb.namespace.resolution.NamespaceContext} used for this
        binding."""
        return self.__namespaceContext

    def expandedName (self):
        """The L{expanded name<pyxb.namespace.ExpandedName>} of the
        element."""
        return self.__expandedName

    def location (self):
        """The L{location<pyxb.utils.utility.Location>} corresponding to the
        element event."""
        return self.__location

    def content (self):
        """An accumulation of content to be supplied to the content model when
        the element end is reached.

        This is a list, with each member being C{(content, element_use,
        maybe_element)}.  C{content} is text or a binding instance;
        C{element_use} is C{None} or the
        L{ElementUse<pyxb.binding.content.ElementUse>} instance used to create
        the content; and C{maybe_element} is C{False} for entries recorded by
        L{addTextContent} and C{True} for entries recorded by
        L{addElementContent}.

        The list returned is the internal one, not a copy."""
        return self.__content

    def addTextContent (self, content):
        """Add the given text as non-element content of the current element.

        The entry is recorded as C{(content, None, False)}.

        @type content: C{unicode} or C{str}
        @return: C{self}
        """
        self.__content.append( (content, None, False) )
        # The documented contract is to return self; previously nothing was
        # returned.
        return self

    def addElementContent (self, element, element_use):
        """Add the given binding instance as element content corresponding to
        the given use.

        The entry is recorded as C{(element, element_use, True)}.

        @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

        @param element_use: The L{element
        use<pyxb.binding.content.ElementUse>} in the containing complex type.
        """
        self.__content.append( (element, element_use, True) )
160
class BaseSAXHandler (xml.sax.handler.ContentHandler, object):
    """A SAX handler class that maintains a stack of enclosing elements and
    manages namespace declarations.

    This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
    L{pyxb.binding.saxer.PyXBSAXHandler}.
    """

    # An instance of L{pyxb.utils.utility.Location} that will be used to
    # construct the locations of events as they are received.
    __locationTemplate = None

    # The callable that creates an instance of (a subclass of)
    # L{SAXElementState} as required to hold element-specific information as
    # parsing proceeds.
    __elementStateConstructor = None

    # The target namespace given at construction; handed to the root
    # namespace context each time the handler is reset.
    __targetNamespace = None

    # The namespace to use when processing a document with an absent default
    # namespace.
    __fallbackNamespace = None

    def fallbackNamespace (self):
        """Return the namespace used to resolve unqualified names with no default namespace."""
        return self.__fallbackNamespace

    # The namespace context that will be in effect at the start of the
    # next element.  One of these is allocated at the start of each
    # element; it moves to become the current namespace upon receipt
    # of either the next element start or a namespace directive that
    # will apply at that element start.
    __nextNamespaceContext = None

    # The namespace context that is in effect for this element.
    __namespaceContext = None

    def namespaceContext (self):
        """Return the namespace context used for QName resolution within the
        current element.

        @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}"""
        return self.__namespaceContext

    # The namespace context in a schema that is including the schema to be
    # parsed by this handler.  This is necessary to handle section 4.2.1 when
    # a schema with a non-absent target namespace includes a schema with no
    # target namespace.
    __includingContext = None

    # A SAX locator object.  @todo: Figure out how to associate the
    # location information with the binding objects.
    __locator = None

    # The state for the element currently being processed
    __elementState = None

    def elementState (self):
        """Return the state object for the element currently being processed."""
        return self.__elementState

    # The states for all enclosing elements.  Created per instance in
    # __init__ and reset(); a list literal here would be shared by every
    # handler instance.
    __elementStateStack = None

    __rootObject = None

    def rootObject (self):
        """Return the binding object corresponding to the top-most
        element in the document

        @return: An instance of L{basis._TypeBinding_mixin} (most usually a
        L{basis.complexTypeDefinition}."""
        return self.__rootObject

    def reset (self):
        """Reset the state of the handler in preparation for processing a new
        document.

        A fresh root namespace context and element state are created, the
        stack of enclosing element states is emptied, and any text still
        pending from an earlier document is discarded.

        @return: C{self}
        """
        self.__namespaceContext = pyxb.namespace.resolution.NamespaceContext(default_namespace=self.__fallbackNamespace,
                                                                             target_namespace=self.__targetNamespace,
                                                                             including_context=self.__includingContext,
                                                                             finalize_target_namespace=False)
        self.__nextNamespaceContext = None
        self.__elementState = self.__elementStateConstructor(namespace_context=self.__namespaceContext)
        self.__elementStateStack = []
        self.__rootObject = None
        # Text accumulated for a previous document must not leak into the
        # next one.
        self.__pendingText = []
        # Note: setDocumentLocator is invoked before startDocument (which
        # calls this), so this method should not reset it.
        return self

    def __init__ (self, **kw):
        """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.

        Recognized keywords are removed from C{kw}; any others are ignored
        by this constructor.

        @keyword including_context: Optional namespace context of a schema
        that is including the schema being parsed.

        @keyword fallback_namespace: Optional namespace to use for unqualified
        names with no default namespace in scope.  Has no effect unless it is
        an absent namespace.

        @keyword element_state_constructor: Optional callable object that
        creates instances of L{SAXElementState} that hold element-specific
        information.  Defaults to L{SAXElementState}.

        @keyword target_namespace: Optional namespace to set as the target
        namespace.  If not provided, there is no target namespace (not even an
        absent one).  This is the appropriate situation when processing plain
        XML documents.

        @keyword location_base: An object to be recorded as the base of all
        L{pyxb.utils.utility.Location} instances associated with events and
        objects handled by the parser.
        """
        self.__includingContext = kw.pop('including_context', None)
        self.__fallbackNamespace = kw.pop('fallback_namespace', None)
        self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
        self.__targetNamespace = kw.pop('target_namespace', None)
        self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
        # Per-instance containers, so that events delivered before the first
        # reset() neither fail nor touch state shared between handlers.
        self.__elementStateStack = []
        self.__pendingText = []

    # If there's a new namespace waiting to be used, make it the
    # current namespace.  Return the current namespace.
    def __updateNamespaceContext (self):
        if self.__nextNamespaceContext is not None:
            self.__namespaceContext = self.__nextNamespaceContext
            self.__nextNamespaceContext = None
        return self.__namespaceContext

    def setDocumentLocator (self, locator):
        """Save the locator object."""
        self.__locator = locator

    def startDocument (self):
        """Process the start of a document.

        This resets this handler for a new document.
        @note: setDocumentLocator is invoked before startDocument
        """
        self.reset()

    def startPrefixMapping (self, prefix, uri):
        """Implement base class method.

        @note: For this to be invoked, the C{feature_namespaces} feature must
        be enabled in the SAX parser."""
        self.__updateNamespaceContext().processXMLNS(prefix, uri)

    # The NamespaceContext management does not require any action upon
    # leaving the scope of a namespace directive, so endPrefixMapping is not
    # overridden.

    def startElementNS (self, name, qname, attrs):
        """Process the start of an element.

        @return: The tuple C{(this_state, parent_state, ns_ctx,
        expanded_name)}: the newly created element state, the state of the
        enclosing element, the namespace context used for this element, and
        the element's expanded name.
        """
        self.__flushPendingText()

        # Get the context to be used for this element, and create a
        # new context for the next contained element to be found.
        ns_ctx = self.__updateNamespaceContext()

        # Get the element name, which is already a tuple with the namespace assigned.
        expanded_name = pyxb.namespace.ExpandedName(name, fallback_namespace=self.__fallbackNamespace)

        tns_attr = pyxb.namespace.resolution.NamespaceContext._TargetNamespaceAttribute(expanded_name)
        if tns_attr is not None:
            # Not true for wsdl
            #assert ns_ctx.targetNamespace() is None
            ns_ctx.finalizeTargetNamespace(attrs.get(tns_attr.uriTuple()), including_context=self.__includingContext)
            assert ns_ctx.targetNamespace() is not None
        self.__nextNamespaceContext = pyxb.namespace.resolution.NamespaceContext(parent_context=ns_ctx)

        # Save the state of the enclosing element, and create a new
        # state for this element.
        parent_state = self.__elementState
        self.__elementStateStack.append(self.__elementState)
        self.__elementState = this_state = self.__elementStateConstructor(expanded_name=expanded_name,
                                                                          namespace_context=ns_ctx,
                                                                          parent_state=parent_state,
                                                                          location=self.__locationTemplate.newLocation(self.__locator))
        return (this_state, parent_state, ns_ctx, expanded_name)

    def endElementNS (self, name, qname):
        """Process the completion of an element.

        @return: The element state of the element that just ended.
        """
        self.__flushPendingText()

        # Save the state of this element, and restore the state for
        # the parent to which we are returning.
        this_state = self.__elementState
        parent_state = self.__elementState = self.__elementStateStack.pop()
        self.__nextNamespaceContext = None
        self.__namespaceContext = parent_state.namespaceContext()

        return this_state

    # We accumulate consecutive text events into a single event, primarily to
    # avoid the confusion that results when the value of a simple type is
    # represented by multiple events, as with "B &amp; W".  Also, it's faster
    # to join them all at once, and to process one content value rather than a
    # sequence of them.  The list itself is created per instance by __init__
    # and reset().
    __pendingText = None

    def __flushPendingText (self):
        # Hand any accumulated text to the current element state as a single
        # string, then start a new accumulation.
        if self.__pendingText:
            self.__elementState.addTextContent(''.join(self.__pendingText))
        self.__pendingText = []

    def characters (self, content):
        """Save the text as content"""
        self.__pendingText.append(content)

    def ignorableWhitespace (self, whitespace):
        """Save whitespace as content too."""
        # Previously this appended the undefined name "content", which
        # raised NameError whenever the parser reported ignorable whitespace.
        self.__pendingText.append(whitespace)

    def processingInstruction (self, target, data):
        """Flush pending text; the instruction itself is not recorded."""
        self.__flushPendingText()
368 369 import StringIO
class _EntityResolver (object):
    """Dummy entity resolver: whatever entity the SAX parser asks about, it
    is handed empty content.  Used to prevent the SAX parser from crashing
    on input that we don't care about."""

    def resolveEntity (self, public_id, system_id):
        """Return a new, empty in-memory stream; both identifiers are
        ignored."""
        empty_stream = StringIO.StringIO('')
        return empty_stream
# Names of the modules handed to xml.sax.make_parser(); empty means the
# system default parser is used.
_CreateParserModules = []

def SetCreateParserModules (create_parser_modules):
    """Record the parser modules that should be tried when creating parsers.

    C{xml.sax.make_parser()} accepts an optional list of module names that
    customize which parser is used.  Certain parsers have better support
    for Unicode than others; for example, C{["drv_libxml2"]} selects the
    libxml2 parser.

    If this function is never called, or is called with an empty list or
    C{None}, no specific modules are requested and the system default parser
    (probably expat) results.

    @param create_parser_modules: an iterable of names of modules that
    provide a C{create_parser} function.  Pass C{None} to reset to the
    system default.  The names are copied into a new list.
    """
    global _CreateParserModules
    _CreateParserModules = [] if create_parser_modules is None else list(create_parser_modules)
399
def make_parser (**kw):
    """Wrap C{xml.sax.make_parser} and configure the resulting parser the
    way this package needs it:

      - C{feature_namespaces} is turned on so xmlns directives are
        processed properly
      - C{feature_namespace_prefixes} is turned off so prefixes are not
        encoded into names (probably redundant with the above but
        still...)

    Keywords not documented here (and C{fallback_namespace}, which is) are
    forwarded to the C{content_handler_constructor} when it has to be
    invoked.

    @keyword content_handler: The content handler instance for the parser
    to use.  If absent, one is created by calling
    C{content_handler_constructor}.
    @type content_handler: C{xml.sax.handler.ContentHandler}

    @keyword content_handler_constructor: A callable which produces an
    appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default
    is L{BaseSAXHandler}.

    @keyword fallback_namespace: The namespace to use for lookups of
    unqualified names in absent namespaces; see
    L{pyxb.namespace.ExpandedName}.  This function does not use it itself;
    it is passed through to the C{content_handler_constructor}.
    @type fallback_namespace: L{pyxb.namespace.Namespace}

    @return: The configured parser.
    """
    handler_factory = kw.pop('content_handler_constructor', BaseSAXHandler)
    handler = kw.pop('content_handler', None)
    if handler is None:
        # Whatever keywords remain belong to the handler constructor.
        handler = handler_factory(**kw)

    parser = xml.sax.make_parser(_CreateParserModules)
    feature_settings = (
        (xml.sax.handler.feature_namespaces, True),
        (xml.sax.handler.feature_namespace_prefixes, False),
        )
    for (feature, enabled) in feature_settings:
        parser.setFeature(feature, enabled)
    parser.setContentHandler(handler)

    # libxml2 doesn't support this feature
    try:
        parser.setEntityResolver(_EntityResolver())
    except xml.sax.SAXNotSupportedException:
        pass
    return parser
if '__main__' == __name__:
    # Benchmark driver: parse one XML document with several parser/handler
    # combinations and print the time each phase took.
    #
    # xml.dom.minidom is imported explicitly because it is referenced by
    # name below; importing xml.dom.pulldom alone does not promise that the
    # minidom submodule has been loaded.
    import xml.dom.minidom
    import xml.dom.pulldom
    import pyxb.utils.domutils as domutils
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import StringIO
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # minidom: the read is timed between dt1 and dt2 so that the "read"
    # figure printed below measures something, and the file is closed even
    # if the read fails.
    dt1 = time.time()
    xml_stream = open(xml_file)
    try:
        xmls = xml_stream.read()
    finally:
        xml_stream.close()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX with a handler that does nothing: baseline parser cost.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX with the PyXB SAX-to-DOM handler.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree replayed as SAX events into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree replayed as SAX events into the pulldom DOM builder.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))

## Local Variables:
## fill-column:78
## End: