1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16  """This module contains support for processing XML using a SAX parser. 
 17   
 18  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 19  that maintains namespace context and element state in a stack; and a L{base 
 20  element state class <SAXElementState>} which records the location of the 
 21  element in the stream.  These classes are extended for specific parsing needs 
 22  (e.g., L{pyxb.binding.saxer}). 
 23  """ 
 24   
 25  import xml.sax 
 26  import xml.sax.handler 
 27  import pyxb.namespace 
 28  import StringIO 
 29  import logging 
 30   
# Logger for this module, named after the module itself via __name__.
_log = logging.getLogger(__name__)
 32   
 34      """A SAX handler class which prints each method invocation. 
 35      """ 
 36   
 37       
 38      __trace = False 
 39   
 41          _log.debug('setDocumentLocator %s', locator) 
  42   
 44          _log.debug('startDocument') 
  45   
 48   
 51   
 53          _log.debug('startElementNS %s %s', name, qname) 
  54   
 56          _log.debug('endElementNS %s %s', name, qname) 
  57   
 60   
 62          _log.debug('ignorableWhitespace len %d', len(whitespace)) 
  63   
 65          _log.debug('processingInstruction %s %s', target, data) 
   66   
 68      """A SAX handler class which doesn't do anything.  Used to get baseline 
 69      performance parsing a particular document. 
 70      """ 
 71   
 74   
 77   
 80   
 83   
 86   
 89   
 92   
 95   
  98   
123   
125      """State corresponding to processing a given element with the SAX 
126      model.""" 
127   
128 -    def contentHandler (self): 
 129          """Reference to the C{xml.sxa.handler.ContentHandler} that is processing the document.""" 
130          return self.__contentHandler 
 131      __contentHandler = None 
132   
134          """Reference to the SAXElementState of the element enclosing this 
135          one.""" 
136          return self.__parentState 
 137      __parentState = None 
138   
139 -    def namespaceContext (self): 
 140          """The L{pyxb.namespace.resolution.NamespaceContext} used for this 
141          binding.""" 
142          return self.__namespaceContext 
 143      __namespaceContext = None 
144   
146          """The L{expanded name<pyxb.namespace.ExpandedName>} of the 
147          element.""" 
148          return self.__expandedName 
 149      __expandedName = None 
150   
152          """The L{location<pyxb.utils.utility.Location>} corresponding to the 
153          element event.""" 
154          return self.__location 
 155      __location = None 
156   
157 -    def content (self): 
 158          """An accumulation of content to be supplied to the content model when 
159          the element end is reached. 
160   
161          This is a list, with each member being C{(content, element_use, 
162          maybe_element)}.  C{content} is text or a binding instance; 
163          C{element_use} is C{None} or the 
164          L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>} instance used to create 
165          the content; and C{maybe_element} is C{True} iff the content is 
166          non-content text.""" 
167          return self.__content 
 168      __content = None 
169   
178   
179 -    def addTextContent (self, location, content): 
 180          """Add the given text as non-element content of the current element. 
181          @type content: C{unicode} or C{str} 
182          @return: C{self} 
183          """ 
184          self.__content.append(SAXInformationItem(location, content, False)) 
 185   
186 -    def addElementContent (self, location, element, element_decl=None): 
 187          """Add the given binding instance as element content corresponding to 
188          the given use. 
189   
190          @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}. 
191   
192          @param element_decl: The L{element 
193          use<pyxb.binding.content.ElementDeclaration>} in the containing complex type. 
194          """ 
195          self.__content.append(SAXInformationItem(location, element, True, element_decl)) 
  196   
198      """A SAX handler class that maintains a stack of enclosing elements and 
199      manages namespace declarations. 
200   
201      This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and 
202      L{pyxb.binding.saxer.PyXBSAXHandler}. 
203      """ 
204   
205       
206       
207      __locationTemplate = None 
208   
212   
213       
214       
215       
216      __elementStateConstructor = None 
217   
218       
219       
221          """Return the namespace used to resolve unqualified names with no default namespace.""" 
222          return self.__fallbackNamespace 
 223      __fallbackNamespace = None 
224   
225       
226       
227       
228       
229       
230      __nextNamespaceContext = None 
231   
232       
233 -    def namespaceContext (self): 
 234          """Return the namespace context used for QName resolution within the 
235          current element. 
236   
237          @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}""" 
238          return self.__namespaceContext 
 239      __namespaceContext = None 
240   
241       
242       
243       
244       
245      __includingContext = None 
246   
247       
248       
249      __locator = None 
250   
251       
254      __elementState = None 
255   
256       
257      __elementStateStack = [] 
258   
260          """Return the binding object corresponding to the top-most 
261          element in the document 
262   
263          @return: An instance of L{basis._TypeBinding_mixin} (most usually a 
264          L{basis.complexTypeDefinition}.""" 
265          return self.__rootObject 
 266      __rootObject = None 
267   
286   
288          """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements. 
289   
290          @keyword fallback_namespace: Optional namespace to use for unqualified 
291          names with no default namespace in scope.  Has no effect unless it is 
292          an absent namespace. 
293   
294          @keyword element_state_constructor: Optional callable object that 
295          creates instances of L{SAXElementState} that hold element-specific 
296          information.  Defaults to L{SAXElementState}. 
297   
298          @keyword target_namespace: Optional namespace to set as the target 
299          namespace.  If not provided, there is no target namespace (not even an 
300          absent one).  This is the appropriate situation when processing plain 
301          XML documents. 
302   
303          @keyword location_base: An object to be recorded as the base of all 
304          L{pyxb.utils.utility.Location} instances associated with events and 
305          objects handled by the parser. 
306          """ 
307          self.__includingContext = kw.pop('including_context', None) 
308          self.__fallbackNamespace = kw.pop('fallback_namespace', None) 
309          self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState) 
310          self.__targetNamespace = kw.pop('target_namespace', None) 
311          self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None)) 
 312   
313       
314       
320   
322          """Save the locator object.""" 
323          self.__locator = locator 
 324   
326          """Process the start of a document. 
327   
328          This resets this handler for a new document. 
329          @note: setDocumentLocator is invoked before startDocument 
330          """ 
331          self.reset() 
 332   
339   
340       
341       
342       
343       
344   
373   
386   
387       
388       
389       
390       
391       
392      __pendingText = None 
393      __pendingTextLocation = None 
402   
408   
412   
 415   
417      """Dummy used to prevent the SAX parser from crashing when it sees 
418      processing instructions that we don't care about.""" 
420          return StringIO.StringIO('') 
  421   
# Names of modules passed to xml.sax.make_parser() when a parser is created.
# An empty list means no specific modules are requested, so the system
# default parser is used.
_CreateParserModules = []
424      """Provide list of modules to be used when creating parsers. 
425   
426      C{xml.sax.make_parser()} takes as a parameter an optional list of modules 
427      which allow customization of the parser to be used.  Certain parsers have 
428      better support for Unicode than others. 
429   
430      As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to 
431      be used. 
432   
433      The default behavior if this function is not called, or if it is called 
434      with an empty list or C{None}, is to provide no specific modules, which 
435      will result in the system default parser (probably expat). 
436   
437      @param create_parser_modules: an iterable list of names of modules that 
438      provide a C{create_parser} function.  Pass C{None} to reset to the system 
439      default.  """ 
440      global _CreateParserModules 
441      if create_parser_modules is None: 
442          _CreateParserModules = [] 
443      else: 
444          _CreateParserModules = list(create_parser_modules) 
 445   
447      """Extend C{xml.sax.make_parser} to configure the parser the way we 
448      need it: 
449   
450        - C{feature_namespaces} is set to C{True} so we process xmlns 
451          directives properly 
452        - C{feature_namespace_prefixes} is set to C{False} so we don't get 
453          prefixes encoded into our names (probably redundant with the above but 
454          still...) 
455   
456      All keywords not documented here (and C{fallback_namespace}, which is) are 
457      passed to the C{content_handler_constructor} if that must be invoked. 
458   
459      @keyword content_handler: The content handler instance for the 
460      parser to use.  If not provided, an instance of C{content_handler_constructor} 
461      is created and used. 
462      @type content_handler: C{xml.sax.handler.ContentHandler} 
463   
464      @keyword content_handler_constructor: A callable which produces an 
465      appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default is 
466      L{BaseSAXHandler}. 
467   
468      @keyword fallback_namespace: The namespace to use for lookups of 
469      unqualified names in absent namespaces; see 
470      L{pyxb.namespace.ExpandedName}.  This keyword is not used by this 
471      function, but is passed to the C{content_handler_constructor}. 
472      @type fallback_namespace: L{pyxb.namespace.Namespace} 
473      """ 
474      content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler) 
475      content_handler = kw.pop('content_handler', None) 
476      if content_handler is None: 
477          content_handler = content_handler_constructor(**kw) 
478      parser = xml.sax.make_parser(_CreateParserModules) 
479      parser.setFeature(xml.sax.handler.feature_namespaces, True) 
480      parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False) 
481      parser.setContentHandler(content_handler) 
482       
483      try: 
484          parser.setEntityResolver(_EntityResolver()) 
485      except xml.sax.SAXNotSupportedException: 
486          pass 
487      return parser 
 488   
if '__main__' == __name__:
    # Ad hoc benchmark: parse the same XML document several different ways
    # (minidom, SAX with a no-op handler, SAX with BaseSAXHandler, the PyXB
    # saxdom handler, and two lxml-driven variants) and print how long the
    # setup and parse phases of each took.
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    # Default sample document; may be overridden by the first command-line
    # argument.
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # Read the document inside the dt1..dt2 window so the reported "read"
    # time is meaningful, and close the file deterministically.
    dt1 = time.time()
    with open(xml_file) as xml_stream:
        xmls = xml_stream.read()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX parse with a handler that does nothing: baseline parser cost.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX parse with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX parse using the handler from pyxb.utils.saxdom.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree construction, then SAX events replayed into Handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree construction, then SAX events replayed into pulldom's SAX2DOM.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    # Single-argument parenthesised print produces the same output under the
    # Python 2 print statement and the print function.
    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))
548   
549   
550   
551   
552