1
2
3
4
5
6
7
8
9
10
def detectXMLEncoding(fp):
    """Detect the character encoding of the XML document read from *fp*.

    *fp* must be a seekable file object that is NOT wrapped by a codec;
    ideally it is opened in binary mode.  The read position of *fp* is
    restored before returning, even if reading fails.

    Detection proceeds in two steps:

    1. If the data starts with a byte order mark (BOM), the Python codec
       name of the matching Unicode encoding is returned: one of
       "utf_32_be", "utf_32_le", "utf_16_be", "utf_16_le" or "utf_8".
    2. Otherwise the XML declaration is searched for its ``encoding``
       attribute and that value is returned unchanged.  The declaration
       has to start at the very first byte of the file, as the XML
       standard requires.

    Returns:
        The detected encoding name as a string, or None if neither step
        succeeds.  XML 1.0 says such a document should be UTF-8, but that
        was not *detected* here; one can only be fairly sure that an
        ASCII-compatible encoding is in use.
    """
    import codecs
    import re

    # Order matters: the UTF-32 LE mark begins with the UTF-16 LE mark, so
    # the four-byte marks have to be tested before the two-byte ones.
    boms = (
        (codecs.BOM_UTF32_BE, "utf_32_be"),
        (codecs.BOM_UTF32_LE, "utf_32_le"),
        (codecs.BOM_UTF8, "utf_8"),
        (codecs.BOM_UTF16_BE, "utf_16_be"),
        (codecs.BOM_UTF16_LE, "utf_16_le"),
    )

    # "[^>]" instead of "." keeps the match inside the declaration (so an
    # encoding="..." attribute of a later element is never picked up) and
    # also lets the declaration span several lines.
    xml_decl_re = re.compile(br"""
        <\?xml\s            # w/o BOM, the declaration starts at the first byte
        [^>]*?              # version info, matched minimally
        encoding\s*=\s*     # encoding attribute, optional space around "="
        ["']                # attribute start delimiter
        (?P<encstr>         # the attribute value is captured as "encstr"
            [^"'>]+         # anything up to the delimiter (not overly exact)
        )
        ["']                # attribute end delimiter
        [^>]*?              # optional standalone declaration / whitespace
        \?>                 # end of the declaration
        """, re.VERBOSE)

    old_pos = fp.tell()
    try:
        fp.seek(0)
        # One read serves both the BOM check and the declaration search.
        head = fp.read(2048)
    finally:
        fp.seek(old_pos)

    if not isinstance(head, bytes):
        # Text-mode file: map code points back to byte values.  Characters
        # outside latin-1 become "?", which is part of no BOM.
        head = head.encode("latin-1", "replace")

    for bom, codec_name in boms:
        if head.startswith(bom):
            return codec_name

    # match() anchors at position 0, i.e. the very first byte of the file.
    found = xml_decl_re.match(head)
    if found is None:
        return None
    # latin-1 maps every byte to a character, so decoding cannot fail.
    return found.group("encstr").decode("latin-1")