Package pyxb :: Package utils :: Module activestate
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.activestate

 1  # This file contains code adopted from http://code.activestate.com/recipes/. 
 2  # Copyright and licensing information is associated with each incorporated 
 3  # recipe.  At this time, all code is licensed under PSF. 
 4   
 5  # http://code.activestate.com/recipes/363841/ 
 6  # by Lars Tiede, derivative from: 
 7  # http://code.activestate.com/recipes/52257/ 
 8  # by Paul Prescod 
 9  # Licensed under PSF 
10  # No changes 
def detectXMLEncoding(fp):
    """Attempt to detect the character encoding of an XML file.

    Detection is tried in two stages:

    1. The start of the file is compared against the known Unicode byte
       order marks (BOMs).  On a match the name of the corresponding
       Python codec is returned.
    2. Otherwise the XML declaration is searched for its ``encoding``
       attribute and that value is returned.  The ``<`` character has to
       be the very first one in the file for this to succeed, and the
       declaration is assumed to fit into the first 2 KB.

    :param fp: a seekable file object positioned anywhere in the file.
        It must not be a codec-wrapped file object.  Both binary files
        (``read`` returns bytes) and plain text files are accepted.
    :return: the detected encoding name as a string, or ``None`` when
        neither the BOM nor the XML declaration reveals it.  According
        to XML 1.0 the encoding should be UTF-8 in that case, but that
        is not verified here.

    The file position of ``fp`` is restored before returning, even when
    reading from it raises an exception.  Files shorter than four bytes
    (including empty ones) are handled and simply yield ``None`` unless
    they consist of a short BOM.
    """
    import re

    # Known BOMs as sequences of byte values.  The order matters: the
    # four-byte patterns are tested first, then the three-byte one, then
    # the two-byte ones, so that UTF-32-LE (FF FE 00 00) is not mistaken
    # for UTF-16-LE (FF FE).
    bom_table = (
        ((0x00, 0x00, 0xFE, 0xFF), "utf_32_be"),
        ((0xFF, 0xFE, 0x00, 0x00), "utf_32_le"),
        ((0xEF, 0xBB, 0xBF), "utf_8"),
        ((0xFE, 0xFF), "utf_16_be"),
        ((0xFF, 0xFE), "utf_16_le"),
    )

    # Read the head of the file once and always put the caller's file
    # position back, no matter what happens while reading.
    old_position = fp.tell()
    try:
        fp.seek(0)
        head = fp.read(2048)
    finally:
        fp.seek(old_position)

    # True only for Python 3 ``bytes``; on Python 2 ``bytes`` is ``str``
    # and the text code paths below already do the right thing.
    is_py3_bytes = isinstance(head, bytes) and not isinstance(head, str)

    # Convert the (up to) four leading units into integer values.
    # Iterating bytes yields ints on Python 3 but one-character strings
    # on Python 2, so go through bytearray for byte strings and through
    # ord() for text.
    first_units = head[:4]
    if isinstance(first_units, bytes):
        leading = tuple(bytearray(first_units))
    else:
        leading = tuple(ord(ch) for ch in first_units)

    ### detection using BOM
    for pattern, codec_name in bom_table:
        if leading[:len(pattern)] == pattern:
            return codec_name

    ### search xml declaration for encoding attribute
    # BOM detection failed, so assume a one-byte character encoding that
    # behaves like ASCII for the characters of the XML declaration.
    xml_decl_pattern = r"""
        ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
        .+?                 # some chars (version info), matched minimal
        encoding=           # encoding attribute begins
        ["']                # attribute start delimiter
        (?P<encstr>         # what's matched in the brackets will be named encstr
         [^"']+             # every character not delimiter (not overly exact!)
        )                   # closes the brackets pair for the named group
        ["']                # attribute end delimiter
        .*?                 # some chars optionally (standalone decl or whitespace)
        \?>                 # xmldecl end
    """
    if is_py3_bytes:
        # A bytes buffer can only be searched with a bytes pattern.
        xml_decl_pattern = xml_decl_pattern.encode("ascii")

    match = re.compile(xml_decl_pattern, re.VERBOSE).search(head)
    if match is None:
        return None

    encoding = match.group("encstr")
    if is_py3_bytes:
        # Hand back a text codec name, consistent with the BOM branch.
        encoding = encoding.decode("ascii", "replace")
    return encoding
95