Package pyxb :: Package utils :: Module saxutils
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.saxutils

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2013, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for processing XML using a SAX parser. 
 17   
 18  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 19  that maintains namespace context and element state in a stack; and a L{base 
 20  element state class <SAXElementState>} which records the location of the 
 21  element in the stream.  These classes are extended for specific parsing needs 
 22  (e.g., L{pyxb.binding.saxer}). 
 23  """ 
 24   
 25  from __future__ import print_function 
 26  import xml.sax 
 27  import xml.sax.handler 
 28  import pyxb.namespace 
 29  import io 
 30  import logging 
 31   
 32  _log = logging.getLogger(__name__) 
 33   
class TracingSAXHandler (xml.sax.handler.ContentHandler):
    """Diagnostic SAX content handler that echoes every event it receives.

    Each callback writes one line to standard output consisting of the
    callback name followed by the arguments of interest.  No document state
    is retained.
    """

    # Whether invocation of handler methods should be traced.  Nothing in
    # this class consults the flag; it is retained as-is.
    __trace = False

    def __report (self, template, *values):
        """Print C{template} after interpolating C{values} into it."""
        print(template % values)

    def setDocumentLocator (self, locator):
        self.__report('setDocumentLocator %s', locator)

    def startDocument (self):
        print('startDocument')

    def startPrefixMapping (self, prefix, uri):
        self.__report('startPrefixMapping %s %s', prefix, uri)

    def endPrefixMapping (self, prefix):
        self.__report('endPrefixMapping %s', prefix)

    def startElementNS (self, name, qname, attrs):
        # The attributes are accepted but deliberately not echoed.
        self.__report('startElementNS %s %s', name, qname)

    def endElementNS (self, name, qname):
        self.__report('endElementNS %s %s', name, qname)

    def characters (self, content):
        self.__report('characters %s', content)

    def ignorableWhitespace (self, whitespace):
        # Only the amount of whitespace is reported, not the text itself.
        self.__report('ignorableWhitespace len %d', len(whitespace))

    def processingInstruction (self, target, data):
        self.__report('processingInstruction %s %s', target, data)
67
class _NoopSAXHandler (xml.sax.handler.ContentHandler):
    """SAX content handler whose callbacks all do nothing.

    Parsing a document with this handler gives the baseline cost of the
    parser itself, against which real handlers can be compared.
    """

    def setDocumentLocator (self, locator):
        """Ignore the locator."""
        return None

    def startDocument (self):
        """Ignore the start of the document."""
        return None

    def startPrefixMapping (self, prefix, uri):
        """Ignore the namespace declaration."""
        return None

    def endPrefixMapping (self, prefix):
        """Ignore the end of a namespace declaration's scope."""
        return None

    def startElementNS (self, name, qname, attrs):
        """Ignore the start of an element."""
        return None

    def endElementNS (self, name, qname):
        """Ignore the end of an element."""
        return None

    def characters (self, content):
        """Ignore character data."""
        return None

    def ignorableWhitespace (self, whitespace):
        """Ignore whitespace."""
        return None

    def processingInstruction (self, target, data):
        """Ignore the processing instruction."""
        return None
99
class SAXInformationItem (object):
    """Record describing one item found in the body of an element."""

    location = None
    """Position in the document at which the item began."""

    item = None
    """The item itself.  Generally character information (as text), a DOM
    Node instance, or a binding instance."""

    maybe_element = None
    """C{False} iff the L{item} is character information rather than element
    content."""

    element_decl = None
    """The
    L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>} used for
    the L{item}, or C{None} for element content that has no enclosing CTD
    scope."""

    def __init__ (self, location, item, maybe_element, element_decl=None):
        """Store the four fields verbatim on the new instance."""
        (self.location,
         self.item,
         self.maybe_element,
         self.element_decl) = (location, item, maybe_element, element_decl)
124
class SAXElementState (object):
    """State corresponding to processing a given element with the SAX
    model."""

    def contentHandler (self):
        """Reference to the C{xml.sax.handler.ContentHandler} that is processing the document."""
        return self.__contentHandler
    __contentHandler = None

    def parentState (self):
        """Reference to the SAXElementState of the element enclosing this
        one, or C{None} if no C{parent_state} was supplied."""
        return self.__parentState
    __parentState = None

    def namespaceContext (self):
        """The L{pyxb.namespace.resolution.NamespaceContext} used for this
        binding."""
        return self.__namespaceContext
    __namespaceContext = None

    def expandedName (self):
        """The L{expanded name<pyxb.namespace.ExpandedName>} of the
        element, or C{None} if no C{expanded_name} was supplied."""
        return self.__expandedName
    __expandedName = None

    def location (self):
        """The L{location<pyxb.utils.utility.Location>} corresponding to the
        element event.

        This is whatever the content handler's C{location()} method returned
        when this state was constructed."""
        return self.__location
    __location = None

    def content (self):
        """An accumulation of content to be supplied to the content model when
        the element end is reached.

        This is a list of L{SAXInformationItem} instances, in the order in
        which they were added.  Entries created by L{addTextContent} have
        C{maybe_element} set to C{False}; entries created by
        L{addElementContent} have it set to C{True} and may carry the
        L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>} used
        to create the content.

        @note: The live internal list is returned, not a copy."""
        return self.__content
    __content = None

    def __init__ (self, **kw):
        """Create the state for one element.

        @keyword expanded_name: Optional name of the element.
        @keyword namespace_context: Required; the namespace context in effect
        for the element.  A C{KeyError} is raised if it is missing.
        @keyword parent_state: Optional state of the enclosing element.
        @keyword content_handler: The handler processing the document; must
        not be C{None} because it is asked for the current location.
        """
        self.__expandedName = kw.get('expanded_name')
        self.__namespaceContext = kw['namespace_context']
        self.__parentState = kw.get('parent_state')
        self.__contentHandler = kw.get('content_handler')
        assert self.__contentHandler is not None
        # Capture the position now: the handler's notion of "current
        # location" moves on as parsing proceeds.
        self.__location = self.__contentHandler.location()
        self.__content = []

    def addTextContent (self, location, content):
        """Add the given text as non-element content of the current element.

        @param location: Where the text began in the document.
        @type content: C{unicode} or C{str}
        @return: C{self}
        """
        self.__content.append(SAXInformationItem(location, content, False))
        return self

    def addElementContent (self, location, element, element_decl=None):
        """Add the given binding instance as element content corresponding to
        the given use.

        @param location: Where the element began in the document.

        @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

        @param element_decl: The L{element
        use<pyxb.binding.content.ElementDeclaration>} in the containing complex type.

        @return: C{self}
        """
        self.__content.append(SAXInformationItem(location, element, True, element_decl))
        return self
197
class BaseSAXHandler (xml.sax.handler.ContentHandler, object):
    """A SAX handler class that maintains a stack of enclosing elements and
    manages namespace declarations.

    This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
    L{pyxb.binding.saxer.PyXBSAXHandler}.
    """

    # An instance of L{pyxb.utils.utility.Location} that will be used to
    # construct the locations of events as they are received.
    __locationTemplate = None

    def location (self):
        """Return the current location within the SAX-processed document."""
        return self.__locationTemplate.newLocation(self.__locator)

    # The callable that creates an instance of (a subclass of)
    # L{SAXElementState} as required to hold element-specific information as
    # parsing proceeds.
    __elementStateConstructor = None

    # The namespace to use when processing a document with an absent default
    # namespace.
    def fallbackNamespace (self):
        """Return the namespace used to resolve unqualified names with no default namespace."""
        return self.__fallbackNamespace
    __fallbackNamespace = None

    # The namespace context that will be in effect at the start of the next
    # element, or C{None} if no namespace directive notifications have been
    # received since the last element start or end.  Namespace directive
    # notifications are received before the notification of element start in
    # which they apply, and cause a "next namespace context" to be allocated
    # referencing the current namespace.  The directive is applied to the next
    # context.  A non-None next context becomes active on entry to the next
    # element.  The next context is reset to None on entry to and exit from an
    # element so subsequent new directives are applied to a fresh context
    # inherited from the current context.
    __nextNamespaceContext = None

    # The namespace context that is in effect for this element.
    def namespaceContext (self):
        """Return the namespace context used for QName resolution within the
        current element.

        @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}"""
        return self.__namespaceContext
    __namespaceContext = None

    # The namespace context in a schema that is including the schema to be
    # parsed by this handler.  This is necessary to handle section 4.2.1 when
    # a schema with a non-absent target namespace includes a schema with no
    # target namespace.
    __includingContext = None

    # The target namespace supplied at construction, if any.
    __targetNamespace = None

    # A SAX locator object.  @todo: Figure out how to associate the
    # location information with the binding objects.
    __locator = None

    # The state for the element currently being processed
    def elementState (self):
        """Return the state object for the element currently being processed."""
        return self.__elementState
    __elementState = None

    # The states for all enclosing elements.  Allocated per instance in
    # __init__ and reset(); a list literal here would be one object shared by
    # every handler instance.
    __elementStateStack = None

    def rootObject (self):
        """Return the binding object corresponding to the top-most
        element in the document

        @note: This class itself only ever resets the value to C{None}.

        @return: An instance of L{basis._TypeBinding_mixin} (most usually a
        L{basis.complexTypeDefinition}."""
        return self.__rootObject
    __rootObject = None

    def reset (self):
        """Reset the state of the handler in preparation for processing a new
        document.

        Any text still buffered from a previous (possibly aborted) parse is
        discarded so it cannot leak into the new document.

        @return: C{self}
        """
        self.__namespaceContext = pyxb.namespace.resolution.NamespaceContext(default_namespace=self.__fallbackNamespace,
                                                                             target_namespace=self.__targetNamespace,
                                                                             including_context=self.__includingContext,
                                                                             finalize_target_namespace=False)
        self.__nextNamespaceContext = None
        self.__elementState = self.__elementStateConstructor(content_handler=self,
                                                             namespace_context=self.__namespaceContext)
        self.__elementStateStack = []
        self.__rootObject = None
        self.__pendingText = []
        self.__pendingTextLocation = None
        # Note: setDocumentLocator is invoked before startDocument (which
        # calls this), so this method should not reset it.
        return self

    def __init__ (self, **kw):
        """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.

        @keyword fallback_namespace: Optional namespace to use for unqualified
        names with no default namespace in scope.  Has no effect unless it is
        an absent namespace.

        @keyword element_state_constructor: Optional callable object that
        creates instances of L{SAXElementState} that hold element-specific
        information.  Defaults to L{SAXElementState}.

        @keyword target_namespace: Optional namespace to set as the target
        namespace.  If not provided, there is no target namespace (not even an
        absent one).  This is the appropriate situation when processing plain
        XML documents.

        @keyword location_base: An object to be recorded as the base of all
        L{pyxb.utils.utility.Location} instances associated with events and
        objects handled by the parser.

        @keyword including_context: Optional namespace context of a schema
        that is including the document being parsed.
        """
        self.__includingContext = kw.pop('including_context', None)
        self.__fallbackNamespace = kw.pop('fallback_namespace', None)
        self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
        self.__targetNamespace = kw.pop('target_namespace', None)
        self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
        # Per-instance mutable state; never share these between handlers.
        self.__elementStateStack = []
        self.__pendingText = []
        self.__pendingTextLocation = None

    def setDocumentLocator (self, locator):
        """Save the locator object."""
        self.__locator = locator

    def startDocument (self):
        """Process the start of a document.

        This resets this handler for a new document.
        @note: setDocumentLocator is invoked before startDocument
        """
        self.reset()

    def __getOrCreateNextNamespaceContext (self):
        """Return the namespace context pending for the next element,
        allocating one as a child of the active context if none is pending."""
        ns_ctx = self.__nextNamespaceContext
        if ns_ctx is None:
            assert self.__namespaceContext is not None
            ns_ctx = pyxb.namespace.resolution.NamespaceContext(parent_context=self.__namespaceContext)
            self.__nextNamespaceContext = ns_ctx
        return ns_ctx

    def startPrefixMapping (self, prefix, uri):
        """Implement base class method.

        @note: For this to be invoked, the C{feature_namespaces} feature must
        be enabled in the SAX parser."""
        self.__getOrCreateNextNamespaceContext().processXMLNS(prefix, uri)

    # The NamespaceContext management does not require any action upon
    # leaving the scope of a namespace directive.
    #def endPrefixMapping (self, prefix):
    #    pass

    def startElementNS (self, name, qname, attrs):
        """Process the start of an element.

        @return: The tuple C{(this_state, parent_state, ns_ctx,
        expanded_name)}: the newly created state for this element, the state
        of the enclosing element, the namespace context now in effect, and
        the expanded name of the element.
        """
        self.__flushPendingText()

        # Get the element name, which is already a tuple with the namespace assigned.
        expanded_name = pyxb.namespace.ExpandedName(name, fallback_namespace=self.__fallbackNamespace)

        # See if this element supports a targetNamespace attribute.  xs:schema
        # and wsdl:definitions both do.
        tns_attr = pyxb.namespace.resolution.NamespaceContext._TargetNamespaceAttribute(expanded_name)

        # If we need to assign a target namespace, we need a new context.
        # Otherwise we use the context created from pending namespace
        # directives, or we re-use the current context.
        if tns_attr is not None:
            ns_ctx = self.__getOrCreateNextNamespaceContext()
        else:
            ns_ctx = self.__nextNamespaceContext
        if ns_ctx is None:
            # Re-use the active context
            ns_ctx = self.__namespaceContext
        else:
            # Update the active context
            self.__namespaceContext = ns_ctx
        self.__nextNamespaceContext = None

        if tns_attr is not None:
            # Not true for wsdl
            #assert ns_ctx.targetNamespace() is None
            ns_ctx.finalizeTargetNamespace(attrs.get(tns_attr.uriTuple()), including_context=self.__includingContext)
            assert ns_ctx.targetNamespace() is not None

        # Save the state of the enclosing element, and create a new
        # state for this element.
        parent_state = self.__elementState
        self.__elementStateStack.append(self.__elementState)
        self.__elementState = this_state = self.__elementStateConstructor(content_handler=self,
                                                                          expanded_name=expanded_name,
                                                                          namespace_context=ns_ctx,
                                                                          parent_state=parent_state)
        return (this_state, parent_state, ns_ctx, expanded_name)

    def endElementNS (self, name, qname):
        """Process the completion of an element.

        @return: The state of the element that just ended.
        """
        self.__flushPendingText()

        # Save the state of this element, and restore the state for
        # the parent to which we are returning.
        this_state = self.__elementState
        parent_state = self.__elementState = self.__elementStateStack.pop()
        # Restore namespace context and prepare for new namespace directives
        self.__namespaceContext = parent_state.namespaceContext()
        self.__nextNamespaceContext = None

        return this_state

    # We accumulate consecutive text events into a single event, primarily to
    # avoid the confusion that results when the value of a simple type is
    # represented by multiple events, as with "B &amp; W".  Also, it's faster
    # to join them all at once, and to process one content value rather than a
    # sequence of them.
    #
    # The class-level default is None rather than a list so that instances
    # never share a buffer; the list is created in __init__/reset, or lazily
    # by __appendPendingText.
    __pendingText = None
    __pendingTextLocation = None

    def __appendPendingText (self, text):
        """Buffer C{text}, creating the buffer first if none exists yet."""
        if self.__pendingText is None:
            self.__pendingText = []
        self.__pendingText.append(text)

    def __flushPendingText (self):
        """Deliver any buffered text to the current element state as a single
        item, then empty the buffer."""
        if self.__pendingText:
            location = self.__pendingTextLocation
            if location is None:
                # Only whitespace events were buffered; they do not record
                # where they started, so use the current position.
                location = self.location()
            self.__elementState.addTextContent(location, ''.join(self.__pendingText))
        self.__pendingTextLocation = None
        self.__pendingText = []

    def characters (self, content):
        """Save the text as content"""
        if self.__pendingTextLocation is None:
            self.__pendingTextLocation = self.location()
        self.__appendPendingText(content)

    def ignorableWhitespace (self, whitespace):
        """Save whitespace as content too."""
        self.__appendPendingText(whitespace)

    def processingInstruction (self, target, data):
        """Flush buffered text; the instruction itself is otherwise ignored."""
        self.__flushPendingText()
435
class _EntityResolver (object):
    """Placeholder resolver that keeps the SAX parser from crashing when it
    sees processing instructions that we don't care about."""

    def resolveEntity (self, public_id, system_id):
        """Return a new, empty text stream regardless of the identifiers
        supplied."""
        no_content = u''
        return io.StringIO(no_content)
# Names of the modules handed to xml.sax.make_parser(); empty means "use the
# system default parser".
_CreateParserModules = []

def SetCreateParserModules (create_parser_modules):
    """Provide list of modules to be used when creating parsers.

    C{xml.sax.make_parser()} accepts an optional list of module names that
    select the parser implementation.  Certain parsers have better support
    for Unicode than others; for example, C{["drv_libxml2"]} selects the
    libxml2 parser.

    If this function is never called, or is called with an empty list or
    C{None}, no specific modules are requested and the system default parser
    (probably expat) is used.

    @param create_parser_modules: an iterable of names of modules that
    provide a C{create_parser} function.  Pass C{None} to reset to the system
    default.  The names are copied into a new list.
    """
    global _CreateParserModules
    requested = () if create_parser_modules is None else create_parser_modules
    _CreateParserModules = list(requested)
465
def make_parser (**kw):
    """Wrap C{xml.sax.make_parser}, returning a parser configured the way
    this package needs it:

      - C{feature_namespaces} is set to C{True} so xmlns directives are
        processed properly
      - C{feature_namespace_prefixes} is set to C{False} so prefixes are not
        encoded into names (probably redundant with the above but still...)

    Every keyword other than C{content_handler} and
    C{content_handler_constructor} (so including C{fallback_namespace}) is
    forwarded to the C{content_handler_constructor} when that has to be
    invoked.

    @keyword content_handler: The content handler instance for the
    parser to use.  If not provided, an instance of C{content_handler_constructor}
    is created and used.
    @type content_handler: C{xml.sax.handler.ContentHandler}

    @keyword content_handler_constructor: A callable which produces an
    appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default is
    L{BaseSAXHandler}.

    @keyword fallback_namespace: The namespace to use for lookups of
    unqualified names in absent namespaces; see
    L{pyxb.namespace.ExpandedName}.  This keyword is not used by this
    function, but is passed to the C{content_handler_constructor}.
    @type fallback_namespace: L{pyxb.namespace.Namespace}

    @return: The configured parser.
    """
    handler_factory = kw.pop('content_handler_constructor', BaseSAXHandler)
    handler = kw.pop('content_handler', None)
    if handler is None:
        # Whatever keywords remain belong to the handler being built.
        handler = handler_factory(**kw)

    sax_parser = xml.sax.make_parser(_CreateParserModules)
    feature_settings = (
        (xml.sax.handler.feature_namespaces, True),
        (xml.sax.handler.feature_namespace_prefixes, False),
    )
    for feature, enabled in feature_settings:
        sax_parser.setFeature(feature, enabled)
    sax_parser.setContentHandler(handler)

    # libxml2 doesn't support this feature
    try:
        sax_parser.setEntityResolver(_EntityResolver())
    except xml.sax.SAXNotSupportedException:
        pass
    return sax_parser
if '__main__' == __name__:
    # Ad-hoc benchmark: parse one document with several parser/handler
    # combinations and print how long each stage took.
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # Time the read itself (reported as "read" below) and make sure the file
    # is closed again; every later stage re-uses the bytes held in memory.
    dt1 = time.time()
    with open(xml_file, 'rb') as xml_stream:
        xmld = xml_stream.read()
    dt2 = time.time()
    dom = xml.dom.minidom.parse(io.BytesIO(xmld))
    dt3 = time.time()

    # SAX with a handler that does nothing: the parser's baseline cost.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(io.BytesIO(xmld))
    snt3 = time.time()

    # SAX with namespace and element-state tracking only.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(io.BytesIO(xmld))
    sbt3 = time.time()

    # SAX driving the handler from pyxb.utils.saxdom.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(io.BytesIO(xmld))
    pdt3 = time.time()

    # lxml tree replayed as SAX events into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmld)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree replayed as SAX events into the pulldom DOM builder.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmld)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))

## Local Variables:
## fill-column:78
## End: