1
2
3
4
5
6
7
8
9
10
11
# NOTE(review): the ``def`` line was missing from the extracted listing; the
# name ``detectXMLEncoding`` and the single ``fp`` parameter are reconstructed
# from the docstring and the body -- confirm the name against the callers.
def detectXMLEncoding(fp):
    """Attempt to detect the character encoding of an XML file.

    ``fp`` is a seekable file object positioned anywhere in the file. It
    should be opened in binary mode and must not be a codec-wrapped file
    object. The file position is restored before returning, even if
    reading fails.

    The return value is one of:

    - If the file starts with a byte order mark (BOM), the codec name of
      the corresponding Unicode encoding: ``"utf_32_be"``, ``"utf_32_le"``,
      ``"utf_16_be"``, ``"utf_16_le"`` or ``"utf_8"``.

    - Otherwise, if the file starts with an XML declaration that carries an
      ``encoding`` attribute, the value of that attribute. The ``<``
      character has to be the very first byte of the file, as the XML
      standard requires.

    - ``None`` if neither a BOM nor an encoding declaration is found.
      According to XML 1.0 the document should then be UTF-8, but that was
      not detected by the means offered here; one can only be fairly sure
      that an ASCII-compatible encoding is in use.
    """
    import re

    # Ordered longest-first so the UTF-32 LE BOM (FF FE 00 00) is not
    # mistaken for the UTF-16 LE BOM (FF FE) that it starts with.
    bom_table = (
        (b"\x00\x00\xfe\xff", "utf_32_be"),
        (b"\xff\xfe\x00\x00", "utf_32_le"),
        (b"\xef\xbb\xbf", "utf_8"),
        (b"\xfe\xff", "utf_16_be"),
        (b"\xff\xfe", "utf_16_le"),
    )

    xml_decl_re = re.compile(br"""
        ^<\?xml(?=\s)       # w/o BOM, the declaration starts at the first byte
        [^>]*?              # version info etc.; never leaves the declaration
        \sencoding\s*=\s*   # encoding attribute, optional space around "="
        ["']                # attribute start delimiter
        (?P<encstr>         # the encoding name is captured as "encstr"
            [^"'>]+         # every character not a delimiter (not overly exact)
        )
        ["']                # attribute end delimiter
        [^>]*?              # optional standalone declaration or whitespace
        \?>                 # end of the XML declaration
        """, re.VERBOSE)

    old_pos = fp.tell()
    try:
        fp.seek(0)
        head = fp.read(2048)
    finally:
        # Always hand the file back at the position the caller left it.
        fp.seek(old_pos)

    if not isinstance(head, bytes):
        # Text-mode file object: map code points 0-255 back to byte values
        # so the byte-level checks below still work; anything else becomes
        # "?" and simply does not match.
        head = head.encode("latin-1", "replace")

    # Prefix matching copes with files shorter than four bytes.
    for bom, codec_name in bom_table:
        if head.startswith(bom):
            return codec_name

    # No BOM: look for the encoding attribute of the XML declaration.
    match = xml_decl_re.search(head)
    if match is None:
        return None

    encoding = match.group("encstr")
    if not isinstance(encoding, str):
        # Python 3 yields bytes here; callers expect a native string.
        encoding = encoding.decode("ascii", "replace")
    return encoding
96