Package pyxb :: Package utils :: Module saxutils
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.saxutils

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2012, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for processing XML using a SAX parser. 
 17   
 18  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 19  that maintains namespace context and element state in a stack; and a L{base 
 20  element state class <SAXElementState>} which records the location of the 
 21  element in the stream.  These classes are extended for specific parsing needs 
 22  (e.g., L{pyxb.binding.saxer}). 
 23  """ 
 24   
 25  import xml.sax 
 26  import xml.sax.handler 
 27  import pyxb.namespace 
 28  import StringIO 
 29  import logging 
 30   
 31  _log = logging.getLogger(__name__) 
 32   
class TracingSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX content handler that reports every callback it receives.

    Each handler method writes one debug-level record to this module's
    logger naming the callback and the arguments of interest; no other
    processing is performed.
    """

    # Whether invocation of handler methods should be traced.  Note that
    # none of the methods in this class consult this flag.
    __trace = False

    def setDocumentLocator (self, locator):
        """Log the locator object supplied by the parser."""
        _log.debug('setDocumentLocator %s', locator)

    def startDocument (self):
        """Log the start of a document."""
        _log.debug('startDocument')

    def startPrefixMapping (self, prefix, uri):
        """Log a namespace prefix coming into scope with its URI."""
        _log.debug('startPrefixMapping %s %s', prefix, uri)

    def endPrefixMapping (self, prefix):
        """Log a namespace prefix going out of scope."""
        _log.debug('endPrefixMapping %s', prefix)

    def startElementNS (self, name, qname, attrs):
        """Log the name and qualified name of an opening element.

        The attributes are accepted but not logged."""
        _log.debug('startElementNS %s %s', name, qname)

    def endElementNS (self, name, qname):
        """Log the name and qualified name of a closing element."""
        _log.debug('endElementNS %s %s', name, qname)

    def characters (self, content):
        """Log the character data that was delivered."""
        _log.debug('characters %s', content)

    def ignorableWhitespace (self, whitespace):
        """Log only the length of the ignorable whitespace run."""
        _log.debug('ignorableWhitespace len %d', len(whitespace))

    def processingInstruction (self, target, data):
        """Log the target and data of a processing instruction."""
        _log.debug('processingInstruction %s %s', target, data)
66
class _NoopSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX content handler whose callbacks all do nothing.

    Used to get baseline performance for parsing a particular document,
    since no handler work is added on top of the parser itself.
    """

    def setDocumentLocator (self, locator):
        """Ignore the locator."""

    def startDocument (self):
        """Ignore the start of the document."""

    def startPrefixMapping (self, prefix, uri):
        """Ignore the namespace prefix declaration."""

    def endPrefixMapping (self, prefix):
        """Ignore the end of the namespace prefix scope."""

    def startElementNS (self, name, qname, attrs):
        """Ignore the element start."""

    def endElementNS (self, name, qname):
        """Ignore the element end."""

    def characters (self, content):
        """Ignore the character data."""

    def ignorableWhitespace (self, whitespace):
        """Ignore the whitespace."""

    def processingInstruction (self, target, data):
        """Ignore the processing instruction."""
98 99
class SAXElementState (object):
    """State corresponding to processing a given element with the SAX
    model.

    An instance records the element's expanded name, namespace context,
    enclosing element state and location, and accumulates the content
    seen inside the element.
    """

    def parentState (self):
        """Reference to the SAXElementState of the element enclosing this
        one."""
        return self.__parentState
    __parentState = None

    def namespaceContext (self):
        """The L{pyxb.namespace.resolution.NamespaceContext} used for this
        binding."""
        return self.__namespaceContext
    __namespaceContext = None

    def expandedName (self):
        """The L{expanded name<pyxb.namespace.ExpandedName>} of the
        element."""
        return self.__expandedName
    __expandedName = None

    def location (self):
        """The L{location<pyxb.utils.utility.Location>} corresponding to the
        element event."""
        return self.__location
    __location = None

    def content (self):
        """An accumulation of content to be supplied to the content model when
        the element end is reached.

        This is a list, with each member being C{(content, element_use,
        maybe_element)}.  C{content} is text or a binding instance;
        C{element_use} is C{None} or the
        L{ElementUse<pyxb.binding.content.ElementUse>} instance used to create
        the content; and C{maybe_element} is C{False} for entries added by
        L{addTextContent} and C{True} for entries added by
        L{addElementContent}.

        @return: The live list held by this state, not a copy."""
        return self.__content
    __content = None

    def __init__ (self, **kw):
        """Create the state for one element.

        @keyword expanded_name: Optional expanded name of the element.

        @keyword namespace_context: The namespace context in effect for the
        element.  This keyword is required.

        @keyword parent_state: Optional state of the enclosing element.

        @keyword location: Optional location of the element event.

        @raise KeyError: if C{namespace_context} is not provided.
        """
        self.__expandedName = kw.get('expanded_name', None)
        self.__namespaceContext = kw['namespace_context']
        self.__parentState = kw.get('parent_state', None)
        self.__location = kw.get('location', None)
        # A fresh list per instance; the class-level default is only None.
        self.__content = []

    def addTextContent (self, content):
        """Add the given text as non-element content of the current element.

        @type content: C{unicode} or C{str}
        @return: C{self}
        """
        self.__content.append( (content, None, False) )
        # The documented contract has always promised self; honor it.
        return self

    def addElementContent (self, element, element_use):
        """Add the given binding instance as element content corresponding to
        the given use.

        @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

        @param element_use: The L{element
        use<pyxb.binding.content.ElementUse>} in the containing complex type.

        @return: C{self}
        """
        self.__content.append( (element, element_use, True) )
        return self
165
class BaseSAXHandler (xml.sax.handler.ContentHandler, object):
    """A SAX handler class that maintains a stack of enclosing elements and
    manages namespace declarations.

    This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
    L{pyxb.binding.saxer.PyXBSAXHandler}.
    """

    # An instance of L{pyxb.utils.utility.Location} that will be used to
    # construct the locations of events as they are received.
    __locationTemplate = None

    # The callable that creates an instance of (a subclass of)
    # L{SAXElementState} as required to hold element-specific information as
    # parsing proceeds.
    __elementStateConstructor = None

    # The target namespace handed to the namespace context created by
    # L{reset}; C{None} means there is no target namespace.
    __targetNamespace = None

    # The namespace to use when processing a document with an absent default
    # namespace.
    def fallbackNamespace (self):
        """Return the namespace used to resolve unqualified names with no default namespace."""
        return self.__fallbackNamespace
    __fallbackNamespace = None

    # The namespace context that will be in effect at the start of the
    # next element.  One of these is allocated at the start of each
    # element; it moves to become the current namespace upon receipt
    # of either the next element start or a namespace directive that
    # will apply at that element start.
    __nextNamespaceContext = None

    # The namespace context that is in effect for this element.
    def namespaceContext (self):
        """Return the namespace context used for QName resolution within the
        current element.

        @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}"""
        return self.__namespaceContext
    __namespaceContext = None

    # The namespace context in a schema that is including the schema to be
    # parsed by this handler.  This is necessary to handle section 4.2.1 when
    # a schema with a non-absent target namespace includes a schema with no
    # target namespace.
    __includingContext = None

    # A SAX locator object.  @todo: Figure out how to associate the
    # location information with the binding objects.
    __locator = None

    # The state for the element currently being processed
    def elementState (self):
        """Return the state object for the element currently being
        processed."""
        return self.__elementState
    __elementState = None

    # The states for all enclosing elements.  The class-level default is
    # deliberately None rather than a list: a list here would be one object
    # shared by every handler instance.  Each instance gets its own list in
    # __init__ and again in reset.
    __elementStateStack = None

    def rootObject (self):
        """Return the binding object corresponding to the top-most
        element in the document

        @return: An instance of L{basis._TypeBinding_mixin} (most usually a
        L{basis.complexTypeDefinition}."""
        return self.__rootObject
    __rootObject = None

    def reset (self):
        """Reset the state of the handler in preparation for processing a new
        document.

        @return: C{self}
        """
        self.__namespaceContext = pyxb.namespace.resolution.NamespaceContext(default_namespace=self.__fallbackNamespace,
                                                                             target_namespace=self.__targetNamespace,
                                                                             including_context=self.__includingContext,
                                                                             finalize_target_namespace=False)
        self.__nextNamespaceContext = None
        self.__elementState = self.__elementStateConstructor(namespace_context=self.__namespaceContext)
        self.__elementStateStack = []
        self.__rootObject = None
        # Discard text left over from a previous (possibly aborted) parse so
        # it cannot be attributed to the new document.
        self.__pendingText = []
        # Note: setDocumentLocator is invoked before startDocument (which
        # calls this), so this method should not reset it.
        return self

    def __init__ (self, **kw):
        """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.

        Recognized keywords are removed from C{kw}; any others are ignored
        by this constructor.

        @keyword including_context: Optional namespace context of a schema
        that is including the schema to be parsed by this handler.

        @keyword fallback_namespace: Optional namespace to use for unqualified
        names with no default namespace in scope.  Has no effect unless it is
        an absent namespace.

        @keyword element_state_constructor: Optional callable object that
        creates instances of L{SAXElementState} that hold element-specific
        information.  Defaults to L{SAXElementState}.

        @keyword target_namespace: Optional namespace to set as the target
        namespace.  If not provided, there is no target namespace (not even an
        absent one).  This is the appropriate situation when processing plain
        XML documents.

        @keyword location_base: An object to be recorded as the base of all
        L{pyxb.utils.utility.Location} instances associated with events and
        objects handled by the parser.
        """
        # Invoked directly rather than through super() because the SAX base
        # class is an old-style class under Python 2.
        xml.sax.handler.ContentHandler.__init__(self)
        self.__includingContext = kw.pop('including_context', None)
        self.__fallbackNamespace = kw.pop('fallback_namespace', None)
        self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
        self.__targetNamespace = kw.pop('target_namespace', None)
        self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
        # Per-instance containers; never share these through the class.
        self.__elementStateStack = []
        self.__pendingText = []

    # If there's a new namespace waiting to be used, make it the
    # current namespace.  Return the current namespace.
    def __updateNamespaceContext (self):
        if self.__nextNamespaceContext is not None:
            self.__namespaceContext = self.__nextNamespaceContext
            self.__nextNamespaceContext = None
        return self.__namespaceContext

    def setDocumentLocator (self, locator):
        """Save the locator object."""
        self.__locator = locator

    def startDocument (self):
        """Process the start of a document.

        This resets this handler for a new document.
        @note: setDocumentLocator is invoked before startDocument
        """
        self.reset()

    def startPrefixMapping (self, prefix, uri):
        """Implement base class method.

        @note: For this to be invoked, the C{feature_namespaces} feature must
        be enabled in the SAX parser."""
        self.__updateNamespaceContext().processXMLNS(prefix, uri)

    # The NamespaceContext management does not require any action upon
    # leaving the scope of a namespace directive, so endPrefixMapping is
    # not overridden here.

    def startElementNS (self, name, qname, attrs):
        """Process the start of an element.

        @return: The tuple C{(this_state, parent_state, ns_ctx,
        expanded_name)}: the newly created element state, the state of the
        enclosing element, the namespace context used for this element, and
        the element's expanded name.
        """
        self.__flushPendingText()

        # Get the context to be used for this element, and create a
        # new context for the next contained element to be found.
        ns_ctx = self.__updateNamespaceContext()

        # Get the element name, which is already a tuple with the namespace assigned.
        expanded_name = pyxb.namespace.ExpandedName(name, fallback_namespace=self.__fallbackNamespace)

        tns_attr = pyxb.namespace.resolution.NamespaceContext._TargetNamespaceAttribute(expanded_name)
        if tns_attr is not None:
            # Not true for wsdl
            #assert ns_ctx.targetNamespace() is None
            ns_ctx.finalizeTargetNamespace(attrs.get(tns_attr.uriTuple()), including_context=self.__includingContext)
            assert ns_ctx.targetNamespace() is not None
        self.__nextNamespaceContext = pyxb.namespace.resolution.NamespaceContext(parent_context=ns_ctx)

        # Save the state of the enclosing element, and create a new
        # state for this element.
        parent_state = self.__elementState
        self.__elementStateStack.append(self.__elementState)
        self.__elementState = this_state = self.__elementStateConstructor(expanded_name=expanded_name,
                                                                          namespace_context=ns_ctx,
                                                                          parent_state=parent_state,
                                                                          location=self.__locationTemplate.newLocation(self.__locator))
        return (this_state, parent_state, ns_ctx, expanded_name)

    def endElementNS (self, name, qname):
        """Process the completion of an element.

        @return: The state of the element that just ended.
        """
        self.__flushPendingText()

        # Save the state of this element, and restore the state for
        # the parent to which we are returning.
        this_state = self.__elementState
        parent_state = self.__elementState = self.__elementStateStack.pop()
        self.__nextNamespaceContext = None
        self.__namespaceContext = parent_state.namespaceContext()

        return this_state

    # We accumulate consecutive text events into a single event, primarily to
    # avoid the confusion that results when the value of a simple type is
    # represented by multiple events, as with "B &amp; W".  Also, it's faster
    # to join them all at once, and to process one content value rather than a
    # sequence of them.
    __pendingText = None

    # Hand any accumulated text to the current element state as one string,
    # then start a new empty accumulation.
    def __flushPendingText (self):
        if self.__pendingText:
            self.__elementState.addTextContent(''.join(self.__pendingText))
        self.__pendingText = []

    # Queue a text fragment, creating the buffer on demand so that a handler
    # whose subclass skipped this class's __init__ still works.
    def __appendPendingText (self, text):
        if self.__pendingText is None:
            self.__pendingText = []
        self.__pendingText.append(text)

    def characters (self, content):
        """Save the text as content"""
        self.__appendPendingText(content)

    def ignorableWhitespace (self, whitespace):
        """Save whitespace as content too."""
        self.__appendPendingText(whitespace)

    def processingInstruction (self, target, data):
        """Flush pending text; the instruction itself is otherwise ignored."""
        self.__flushPendingText()
372
373 -class _EntityResolver (object):
374 """Dummy used to prevent the SAX parser from crashing when it sees 375 processing instructions that we don't care about."""
376 - def resolveEntity (self, public_id, system_id):
377 return StringIO.StringIO('')
# Names of the modules handed to xml.sax.make_parser by make_parser; an
# empty list selects the system default parser.
_CreateParserModules = []

def SetCreateParserModules (create_parser_modules):
    """Provide list of modules to be used when creating parsers.

    C{xml.sax.make_parser()} takes as a parameter an optional list of modules
    which allow customization of the parser to be used.  Certain parsers have
    better support for Unicode than others.

    As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to
    be used.

    The default behavior if this function is not called, or if it is called
    with an empty list or C{None}, is to provide no specific modules, which
    will result in the system default parser (probably expat).

    @param create_parser_modules: an iterable list of names of modules that
    provide a C{create_parser} function.  Pass C{None} to reset to the system
    default.  """
    global _CreateParserModules
    selected_modules = []
    if create_parser_modules is not None:
        # Snapshot the iterable so the stored value is always a private list.
        selected_modules = list(create_parser_modules)
    _CreateParserModules = selected_modules
402
def make_parser (**kw):
    """Extend C{xml.sax.make_parser} to configure the parser the way we
    need it:

      - C{feature_namespaces} is set to C{True} so we process xmlns
        directives properly
      - C{feature_namespace_prefixes} is set to C{False} so we don't get
        prefixes encoded into our names (probably redundant with the above but
        still...)

    All keywords not documented here (and C{fallback_namespace}, which is) are
    passed to the C{content_handler_constructor} if that must be invoked.

    @keyword content_handler: The content handler instance for the
    parser to use.  If not provided, an instance of C{content_handler_constructor}
    is created and used.
    @type content_handler: C{xml.sax.handler.ContentHandler}

    @keyword content_handler_constructor: A callable which produces an
    appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default is
    L{BaseSAXHandler}.

    @keyword fallback_namespace: The namespace to use for lookups of
    unqualified names in absent namespaces; see
    L{pyxb.namespace.ExpandedName}.  This keyword is not used by this
    function, but is passed to the C{content_handler_constructor}.
    @type fallback_namespace: L{pyxb.namespace.Namespace}

    @return: The configured parser, with the content handler installed.
    """
    handler_factory = kw.pop('content_handler_constructor', BaseSAXHandler)
    handler = kw.pop('content_handler', None)
    if handler is None:
        # Whatever keywords remain belong to the handler constructor.
        handler = handler_factory(**kw)

    sax_parser = xml.sax.make_parser(_CreateParserModules)
    feature_settings = (
        (xml.sax.handler.feature_namespaces, True),
        (xml.sax.handler.feature_namespace_prefixes, False),
        )
    for (feature, enabled) in feature_settings:
        sax_parser.setFeature(feature, enabled)
    sax_parser.setContentHandler(handler)

    # libxml2 doesn't support this feature
    try:
        sax_parser.setEntityResolver(_EntityResolver())
    except xml.sax.SAXNotSupportedException:
        pass
    return sax_parser
# Benchmark driver: when run as a script, parse one XML document with
# several parser/handler combinations and print the time each one takes.
if '__main__' == __name__:
    # minidom is imported explicitly because it is used by name below;
    # importing pulldom alone is not relied on to make it available.
    import xml.dom.minidom
    import xml.dom.pulldom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # The read is placed between dt1 and dt2 so the "read" figure reported
    # below measures it.  try/finally guarantees the file is closed.
    dt1 = time.time()
    xml_stream = open(xml_file)
    try:
        xmls = xml_stream.read()
    finally:
        xml_stream.close()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX parse with a handler that does nothing
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX parse with the namespace-tracking base handler
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX parse with the saxdom handler
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree replayed as SAX events into the base handler
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree replayed as SAX events into pulldom's SAX2DOM
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    # A single parenthesized argument prints identically whether print is a
    # statement or a function.
    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))

## Local Variables:
## fill-column:78
## End: