Package pyxb :: Package utils :: Module saxutils
[hide private]
[frames] | no frames]

Source Code for Module pyxb.utils.saxutils

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2012, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for processing XML using a SAX parser. 
 17   
 18  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 19  that maintains namespace context and element state in a stack; and a L{base 
 20  element state class <SAXElementState>} which records the location of the 
 21  element in the stream.  These classes are extended for specific parsing needs 
 22  (e.g., L{pyxb.binding.saxer}). 
 23  """ 
 24   
 25  import xml.sax 
 26  import xml.sax.handler 
 27  import pyxb.namespace 
 28  import StringIO 
 29  import logging 
 30   
 31  _log = logging.getLogger(__name__) 
 32   
class TracingSAXHandler(xml.sax.handler.ContentHandler):
    """A SAX content handler that reports every callback it receives.

    Each handler method writes one debug-level record to the module logger
    C{_log}; no other processing is done.
    """

    # Whether invocation of handler methods should be traced.  None of the
    # methods in this class consults the flag.
    __trace = False

    def setDocumentLocator(self, locator):
        """Log the locator supplied by the parser."""
        _log.debug('setDocumentLocator %s', locator)

    def startDocument(self):
        """Log the start of the document."""
        _log.debug('startDocument')

    def startPrefixMapping(self, prefix, uri):
        """Log the start of a prefix-to-URI mapping scope."""
        _log.debug('startPrefixMapping %s %s', prefix, uri)

    def endPrefixMapping(self, prefix):
        """Log the end of a prefix mapping scope."""
        _log.debug('endPrefixMapping %s', prefix)

    def startElementNS(self, name, qname, attrs):
        """Log the start of an element; C{attrs} is not reported."""
        _log.debug('startElementNS %s %s', name, qname)

    def endElementNS(self, name, qname):
        """Log the end of an element."""
        _log.debug('endElementNS %s %s', name, qname)

    def characters(self, content):
        """Log the character data that was received."""
        _log.debug('characters %s', content)

    def ignorableWhitespace(self, whitespace):
        """Log only the length of the ignorable whitespace run."""
        _log.debug('ignorableWhitespace len %d', len(whitespace))

    def processingInstruction(self, target, data):
        """Log a processing instruction."""
        _log.debug('processingInstruction %s %s', target, data)
66
class _NoopSAXHandler(xml.sax.handler.ContentHandler):
    """A SAX content handler whose callbacks all do nothing.

    Used to measure the baseline cost of parsing a particular document
    without any handler work.
    """

    def setDocumentLocator(self, locator):
        """Ignore the document locator."""

    def startDocument(self):
        """Ignore the start of the document."""

    def startPrefixMapping(self, prefix, uri):
        """Ignore the start of a prefix mapping."""

    def endPrefixMapping(self, prefix):
        """Ignore the end of a prefix mapping."""

    def startElementNS(self, name, qname, attrs):
        """Ignore the start of an element."""

    def endElementNS(self, name, qname):
        """Ignore the end of an element."""

    def characters(self, content):
        """Ignore character data."""

    def ignorableWhitespace(self, whitespace):
        """Ignore ignorable whitespace."""

    def processingInstruction(self, target, data):
        """Ignore processing instructions."""
98
class SAXInformationItem(object):
    """Record describing one item discovered in the body of an element."""

    #: Where the item began in the document.
    location = None

    #: The item.  Generally either character information (as text) or a DOM
    #: Node instance or a binding instance.
    item = None

    #: C{False} iff the L{item} is character information as opposed to
    #: element content.
    maybe_element = None

    #: A reference to the
    #: L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>} used for
    #: the L{item}.  This will be C{None} for element content that does not
    #: have an enclosing CTD scope.
    element_decl = None

    def __init__(self, location, item, maybe_element, element_decl=None):
        """Store the four fields exactly as given.

        @param location: Where the item began in the document.
        @param item: The text, DOM node, or binding instance.
        @param maybe_element: C{False} iff C{item} is character information.
        @param element_decl: Optional element declaration for C{item}.
        """
        self.location, self.item = location, item
        self.maybe_element, self.element_decl = maybe_element, element_decl
123
class SAXElementState (object):
    """State corresponding to processing a given element with the SAX
    model."""

    def contentHandler (self):
        """Reference to the C{xml.sax.handler.ContentHandler} that is processing the document."""
        return self.__contentHandler
    __contentHandler = None

    def parentState (self):
        """Reference to the SAXElementState of the element enclosing this
        one."""
        return self.__parentState
    __parentState = None

    def namespaceContext (self):
        """The L{pyxb.namespace.resolution.NamespaceContext} used for this
        binding."""
        return self.__namespaceContext
    __namespaceContext = None

    def expandedName (self):
        """The L{expanded name<pyxb.namespace.ExpandedName>} of the
        element."""
        return self.__expandedName
    __expandedName = None

    def location (self):
        """The L{location<pyxb.utils.utility.Location>} corresponding to the
        element event."""
        return self.__location
    __location = None

    def content (self):
        """An accumulation of content to be supplied to the content model when
        the element end is reached.

        This is a list of L{SAXInformationItem} instances in the order they
        were added.  For each one, C{item} is text or a binding instance;
        C{element_decl} is C{None} or the
        L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>}
        instance passed when the content was added; and C{maybe_element} is
        C{False} for text added through L{addTextContent} and C{True} for
        content added through L{addElementContent}."""
        return self.__content
    __content = None

    def __init__ (self, **kw):
        """Create the state for one element.

        @keyword expanded_name: Optional expanded name of the element.

        @keyword namespace_context: The namespace context in effect for the
        element.  This keyword is required; C{KeyError} is raised if it is
        missing.

        @keyword parent_state: Optional state of the enclosing element.

        @keyword content_handler: The content handler processing the
        document.  It must not be C{None}; its C{location()} method is
        invoked to record where this element begins.
        """
        self.__expandedName = kw.get('expanded_name')
        self.__namespaceContext = kw['namespace_context']
        self.__parentState = kw.get('parent_state')
        self.__contentHandler = kw.get('content_handler')
        assert self.__contentHandler is not None
        self.__location = self.__contentHandler.location()
        self.__content = []

    def addTextContent (self, location, content):
        """Add the given text as non-element content of the current element.

        @param location: Where the text began in the document.
        @type content: C{unicode} or C{str}
        @return: C{self}
        """
        self.__content.append(SAXInformationItem(location, content, False))
        # The documented contract is to return self; the method used to fall
        # off the end and return None.
        return self

    def addElementContent (self, location, element, element_decl=None):
        """Add the given binding instance as element content corresponding to
        the given use.

        @param location: Where the element began in the document.

        @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

        @param element_decl: The L{element
        use<pyxb.binding.content.ElementDeclaration>} in the containing complex type.

        @return: C{self}
        """
        self.__content.append(SAXInformationItem(location, element, True, element_decl))
        return self
196
class BaseSAXHandler (xml.sax.handler.ContentHandler, object):
    """A SAX handler class that maintains a stack of enclosing elements and
    manages namespace declarations.

    This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
    L{pyxb.binding.saxer.PyXBSAXHandler}.
    """

    # An instance of L{pyxb.utils.utility.Location} that will be used to
    # construct the locations of events as they are received.
    __locationTemplate = None

    def location (self):
        """Return the current location within the SAX-processed document."""
        return self.__locationTemplate.newLocation(self.__locator)

    # The callable that creates an instance of (a subclass of)
    # L{SAXElementState} as required to hold element-specific information as
    # parsing proceeds.
    __elementStateConstructor = None

    # The namespace to use when processing a document with an absent default
    # namespace.
    def fallbackNamespace (self):
        """Return the namespace used to resolve unqualified names with no default namespace."""
        return self.__fallbackNamespace
    __fallbackNamespace = None

    # The namespace context that will be in effect at the start of the
    # next element.  One of these is allocated at the start of each
    # element; it moves to become the current namespace upon receipt
    # of either the next element start or a namespace directive that
    # will apply at that element start.
    __nextNamespaceContext = None

    # The namespace context that is in effect for this element.
    def namespaceContext (self):
        """Return the namespace context used for QName resolution within the
        current element.

        @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}"""
        return self.__namespaceContext
    __namespaceContext = None

    # The namespace context in a schema that is including the schema to be
    # parsed by this handler.  This is necessary to handle section 4.2.1 when
    # a schema with a non-absent target namespace includes a schema with no
    # target namespace.
    __includingContext = None

    # A SAX locator object.  @todo: Figure out how to associate the
    # location information with the binding objects.
    __locator = None

    # The state for the element currently being processed
    def elementState (self):
        """Return the state object for the element currently being processed."""
        return self.__elementState
    __elementState = None

    # The states for all enclosing elements.  This class-level list is only a
    # placeholder: __init__ and reset() bind a fresh list on each instance so
    # that handlers never share a stack.
    __elementStateStack = []

    def rootObject (self):
        """Return the binding object corresponding to the top-most
        element in the document

        @return: An instance of L{basis._TypeBinding_mixin} (most usually a
        L{basis.complexTypeDefinition}."""
        return self.__rootObject
    __rootObject = None

    def reset (self):
        """Reset the state of the handler in preparation for processing a new
        document.

        @return: C{self}
        """
        self.__namespaceContext = pyxb.namespace.resolution.NamespaceContext(default_namespace=self.__fallbackNamespace,
                                                                             target_namespace=self.__targetNamespace,
                                                                             including_context=self.__includingContext,
                                                                             finalize_target_namespace=False)
        self.__nextNamespaceContext = None
        self.__elementState = self.__elementStateConstructor(content_handler=self,
                                                             namespace_context=self.__namespaceContext)
        self.__elementStateStack = []
        self.__rootObject = None
        # Discard text buffered by a previous parse that did not reach a
        # flush (e.g. one that was aborted), so it cannot be delivered as
        # content of the new document.
        self.__pendingText = []
        self.__pendingTextLocation = None
        # Note: setDocumentLocator is invoked before startDocument (which
        # calls this), so this method should not reset it.
        return self

    def __init__ (self, **kw):
        """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.

        @keyword fallback_namespace: Optional namespace to use for unqualified
        names with no default namespace in scope.  Has no effect unless it is
        an absent namespace.

        @keyword element_state_constructor: Optional callable object that
        creates instances of L{SAXElementState} that hold element-specific
        information.  Defaults to L{SAXElementState}.

        @keyword target_namespace: Optional namespace to set as the target
        namespace.  If not provided, there is no target namespace (not even an
        absent one).  This is the appropriate situation when processing plain
        XML documents.

        @keyword location_base: An object to be recorded as the base of all
        L{pyxb.utils.utility.Location} instances associated with events and
        objects handled by the parser.

        @keyword including_context: Optional namespace context of a schema
        that is including the schema being parsed.  It is handed to the
        namespace contexts this handler creates.
        """
        self.__includingContext = kw.pop('including_context', None)
        self.__fallbackNamespace = kw.pop('fallback_namespace', None)
        self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
        self.__targetNamespace = kw.pop('target_namespace', None)
        # Give every instance its own stack rather than the mutable list
        # defined at class level.
        self.__elementStateStack = []
        self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))

    # If there's a new namespace waiting to be used, make it the
    # current namespace.  Return the current namespace.
    def __updateNamespaceContext (self):
        if self.__nextNamespaceContext is not None:
            self.__namespaceContext = self.__nextNamespaceContext
            self.__nextNamespaceContext = None
        return self.__namespaceContext

    def setDocumentLocator (self, locator):
        """Save the locator object."""
        self.__locator = locator

    def startDocument (self):
        """Process the start of a document.

        This resets this handler for a new document.
        @note: setDocumentLocator is invoked before startDocument
        """
        self.reset()

    def startPrefixMapping (self, prefix, uri):
        """Implement base class method.

        @note: For this to be invoked, the C{feature_namespaces} feature must
        be enabled in the SAX parser."""
        self.__updateNamespaceContext().processXMLNS(prefix, uri)

    # The NamespaceContext management does not require any action upon
    # leaving the scope of a namespace directive.
    #def endPrefixMapping (self, prefix):
    #    pass

    def startElementNS (self, name, qname, attrs):
        """Process the start of an element.

        @return: The tuple C{(this_state, parent_state, ns_ctx,
        expanded_name)}: the newly created element state, the state of the
        enclosing element, the namespace context for this element, and the
        element's expanded name.
        """
        self.__flushPendingText()

        # Get the context to be used for this element, and create a
        # new context for the next contained element to be found.
        ns_ctx = self.__updateNamespaceContext()

        # Get the element name, which is already a tuple with the namespace assigned.
        expanded_name = pyxb.namespace.ExpandedName(name, fallback_namespace=self.__fallbackNamespace)

        tns_attr = pyxb.namespace.resolution.NamespaceContext._TargetNamespaceAttribute(expanded_name)
        if tns_attr is not None:
            # Not true for wsdl
            #assert ns_ctx.targetNamespace() is None
            ns_ctx.finalizeTargetNamespace(attrs.get(tns_attr.uriTuple()), including_context=self.__includingContext)
            assert ns_ctx.targetNamespace() is not None
        self.__nextNamespaceContext = pyxb.namespace.resolution.NamespaceContext(parent_context=ns_ctx)

        # Save the state of the enclosing element, and create a new
        # state for this element.
        parent_state = self.__elementState
        self.__elementStateStack.append(self.__elementState)
        self.__elementState = this_state = self.__elementStateConstructor(content_handler=self,
                                                                          expanded_name=expanded_name,
                                                                          namespace_context=ns_ctx,
                                                                          parent_state=parent_state)
        return (this_state, parent_state, ns_ctx, expanded_name)

    def endElementNS (self, name, qname):
        """Process the completion of an element.

        @return: The state of the element that just ended.
        """
        self.__flushPendingText()

        # Save the state of this element, and restore the state for
        # the parent to which we are returning.
        this_state = self.__elementState
        parent_state = self.__elementState = self.__elementStateStack.pop()
        self.__nextNamespaceContext = None
        self.__namespaceContext = parent_state.namespaceContext()

        return this_state

    # We accumulate consecutive text events into a single event, primarily to
    # avoid the confusion that results when the value of a simple type is
    # represented by multiple events, as with "B &amp; W".  Also, it's faster
    # to join them all at once, and to process one content value rather than a
    # sequence of them.
    __pendingText = None
    __pendingTextLocation = None

    def __flushPendingText (self):
        # Deliver any buffered text to the current element state as a single
        # string, then start over with an empty buffer.
        if self.__pendingText:
            location = self.__pendingTextLocation
            if location is None:
                # Only ignorable whitespace was buffered, so no start
                # location was recorded; use the current one.
                location = self.location()
            self.__elementState.addTextContent(location, ''.join(self.__pendingText))
        self.__pendingTextLocation = None
        self.__pendingText = []

    def __bufferPendingText (self, text):
        # Append text to the pending buffer.  The buffer is created on first
        # use so that text events arriving before any flush or reset do not
        # fail on the class-level None default.
        if self.__pendingText is None:
            self.__pendingText = []
        self.__pendingText.append(text)

    def characters (self, content):
        """Save the text as content"""
        if self.__pendingTextLocation is None:
            self.__pendingTextLocation = self.location()
        self.__bufferPendingText(content)

    def ignorableWhitespace (self, whitespace):
        """Save whitespace as content too."""
        self.__bufferPendingText(whitespace)

    def processingInstruction (self, target, data):
        """Flush any pending text; the instruction itself is otherwise ignored."""
        self.__flushPendingText()
415
class _EntityResolver(object):
    """Stand-in entity resolver that answers every request with an empty
    document, so the SAX parser does not crash on input we do not care
    about."""

    def resolveEntity(self, public_id, system_id):
        """Return a new, empty in-memory stream; both identifiers are ignored."""
        empty_stream = StringIO.StringIO('')
        return empty_stream

# Names of the modules handed to xml.sax.make_parser() when a parser is
# created.  An empty list selects the system default parser.
_CreateParserModules = []

def SetCreateParserModules(create_parser_modules):
    """Provide list of modules to be used when creating parsers.

    C{xml.sax.make_parser()} accepts an optional list of module names that
    customize which parser gets used.  Certain parsers have better support
    for Unicode than others.

    For example, passing C{["drv_libxml2"]} causes the libxml2 parser to be
    used.

    If this function is never called, or is called with an empty list or
    C{None}, no specific modules are provided and the system default parser
    (probably expat) results.

    @param create_parser_modules: an iterable list of names of modules that
    provide a C{create_parser} function.  Pass C{None} to reset to the system
    default.  The names are copied into a new list.
    """
    global _CreateParserModules
    selected_modules = []
    if create_parser_modules is not None:
        selected_modules.extend(create_parser_modules)
    _CreateParserModules = selected_modules
445
def make_parser(**kw):
    """Wrap C{xml.sax.make_parser} and configure the result the way this
    package needs it:

      - C{feature_namespaces} is set to C{True} so xmlns directives are
        processed properly
      - C{feature_namespace_prefixes} is set to C{False} so prefixes are not
        encoded into names (probably redundant with the above but still...)

    Every keyword not documented here (and C{fallback_namespace}, which is)
    is handed to the C{content_handler_constructor} when that has to be
    invoked.

    @keyword content_handler: The content handler instance for the parser to
    use.  When absent, one is created by calling
    C{content_handler_constructor}.
    @type content_handler: C{xml.sax.handler.ContentHandler}

    @keyword content_handler_constructor: A callable which produces an
    appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default
    is L{BaseSAXHandler}.

    @keyword fallback_namespace: The namespace to use for lookups of
    unqualified names in absent namespaces; see
    L{pyxb.namespace.ExpandedName}.  This function does not use the keyword
    itself; it is passed on to the C{content_handler_constructor}.
    @type fallback_namespace: L{pyxb.namespace.Namespace}

    @return: The configured parser.
    """
    handler_factory = kw.pop('content_handler_constructor', BaseSAXHandler)
    handler = kw.pop('content_handler', None)
    if handler is None:
        # Whatever keywords remain belong to the handler constructor.
        handler = handler_factory(**kw)

    sax_parser = xml.sax.make_parser(_CreateParserModules)
    feature_settings = (
        (xml.sax.handler.feature_namespaces, True),
        (xml.sax.handler.feature_namespace_prefixes, False),
    )
    for feature, enabled in feature_settings:
        sax_parser.setFeature(feature, enabled)
    sax_parser.setContentHandler(handler)

    # libxml2 doesn't support this feature
    try:
        sax_parser.setEntityResolver(_EntityResolver())
    except xml.sax.SAXNotSupportedException:
        pass
    return sax_parser

if '__main__' == __name__:
    # Crude benchmark: parse one XML document with several parser and
    # handler combinations and report how long each stage took.
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # Read the document between dt1 and dt2 so the reported "read" time
    # measures the read, and close the file as soon as it has been read.
    dt1 = time.time()
    with open(xml_file) as xml_stream:
        xmls = xml_stream.read()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX parse with a handler that does nothing.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX parse with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX parse using the handler from pyxb.utils.saxdom.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree construction, then replay as SAX events into the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree construction, then replay as SAX events into pulldom's SAX2DOM.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))

## Local Variables:
## fill-column:78
## End: