1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 """This module contains support for Unicode characters as required to
16 support the regular expression syntax defined in U{annex F
17 <http://www/Documentation/W3C/www.w3.org/TR/xmlschema-2/index.html#regexs>}
18 of the XML Schema definition.
19
20 In particular, we need to be able to identify character properties and
21 block escapes, as defined in F.1.1, by name.
22
23 - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt}
24 - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt}
25 - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt}
26
27 The Unicode database active at the time XML Schema 1.0 was defined is
28 archived at
29 U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html},
30 and refers to U{Unicode Standard Annex #27: Unicode 3.1
31 <http://www.unicode.org/unicode/reports/tr27/>}.
32 """
33
34 import re
35
# Probe whether this interpreter's ``re`` module can handle code points
# above the Basic Multilingual Plane (U+10000 and up).
#
# The pattern has to be a unicode literal using full eight-digit \U
# escapes.  On a narrow (UCS-2) build each escape is stored as a
# surrogate pair, which turns the character class into an inverted
# range that ``re`` rejects; on a wide build the range compiles cleanly.
SupportsWideUnicode = False
try:
    re.compile(u'[\U0001d7ce-\U0001d7ff]')
except re.error:
    # The range could not be compiled: leave the flag False.
    pass
else:
    SupportsWideUnicode = True
42
43 import bisect
44
46 """Raised when some abuse of a L{CodePointSet} is detected."""
47 pass
48
50 """Represent a set of Unicode code points.
51
52 Each code point is an integral value between 0 and 0x10FFFF. This
53 class is used to represent a set of code points in a manner
54 suitable for use as regular expression character sets."""
55
56 MaxCodePoint = 0x10FFFF
57 """The maximum value for a code point in the Unicode code point
58 space. This is normally 0xFFFF, because wide unicode characters
59 are generally not enabled in Python builds. If, however, they are
60 enabled, this will be the full value of 0x10FFFF."""
61
62 MaxShortCodePoint = 0xFFFF
63 if not SupportsWideUnicode:
64 MaxCodePoint = MaxShortCodePoint
65
66
67
68
69
70
71
72
73 __codepoints = None
74
76 """For testing purposes only, access to the codepoints
77 internal representation."""
78 return self.__codepoints
79
83
93
94
96
97
98 if isinstance(value, tuple):
99 (s, e) = value
100 e += 1
101 elif isinstance(value, basestring):
102 if 1 < len(value):
103 raise TypeError()
104 s = ord(value)
105 e = s+1
106 else:
107 s = int(value)
108 e = s+1
109 if s > e:
110 raise ValueError('codepoint range value order')
111
112
113
114 if s > self.MaxCodePoint:
115 return self
116 if e > self.MaxCodePoint:
117 e = self.MaxCodePoint
118 e = min(e, self.MaxCodePoint)
119
120
121 li = bisect.bisect_left(self.__codepoints, s)
122
123 ri = bisect.bisect_right(self.__codepoints, e)
124
125 case = ((li & 1) << 1) | (ri & 1)
126 if not do_add:
127 case = 3 - case
128
129 if 0x03 == case:
130
131 del self.__codepoints[li:ri]
132 elif 0x02 == case:
133
134 del self.__codepoints[li+1:ri]
135 self.__codepoints[li] = e
136 elif 0x01 == case:
137
138 del self.__codepoints[li+1:ri]
139 self.__codepoints[li] = s
140 else:
141
142 self.__codepoints[li:ri] = [s, e]
143 return self
144
def add(self, value):
    """Insert a code point, or an inclusive range of them, into this set.

    @param value: Either an integral value naming one code point, or a
    tuple C{(s,e)} giving the first and last (both inclusive) code
    points of a range.
    @return: C{self}"""
    # Delegate to the shared mutator; True selects insertion.
    include = True
    return self.__mutate(value, include)
153
155 """Add multiple values to a code point set.
156
157 @param values: Either a L{CodePointSet} instance, or an iterable
158 whose members are valid parameters to L{add}.
159
160 @return: C{self}"""
161 if isinstance(values, CodePointSet):
162 self.extend(values.asTuples())
163 else:
164 [ self.__mutate(_v, True) for _v in values ]
165 return self
166
168 """Remove the given value from the code point set.
169
170 @param value: An integral value denoting a code point, or a tuple
171 C{(s,e)} denoting the start and end (inclusive) code points in a
172 range, or a L{CodePointSet}.
173
174 @return: C{self}"""
175 if isinstance(value, CodePointSet):
176 [ self.subtract(_v) for _v in value.asTuples() ]
177 return self
178 return self.__mutate(value, False)
179
180
181
182 __NotXMLChar_set = frozenset([ '-', '[', ']' ])
183
184
185
187 rv = unichr(code_point)
188 if rv in self.__NotXMLChar_set:
189 rv = u'\\' + rv
190 return rv
191
193 """Return the code point set as Unicode regular expression
194 character group consisting of a sequence of characters or
195 character ranges.
196
197 @param with_brackets: If C{True} (default), square brackets
198 are added to enclose the returned character group."""
199 rva = []
200 if with_brackets:
201 rva.append(u'[')
202 for (s, e) in self.asTuples():
203 if s == e:
204 rva.append(self.__unichr(s))
205 else:
206 rva.extend([self.__unichr(s), '-', self.__unichr(e)])
207 if with_brackets:
208 rva.append(u']')
209 return u''.join(rva)
210
212 """Return the codepoints as tuples denoting the ranges that are in
213 the set.
214
215 Each tuple C{(s, e)} indicates that the code points from C{s}
216 (inclusive) to C{e} (inclusive) are in the set."""
217
218 rv = []
219 start = None
220 for ri in xrange(len(self.__codepoints)):
221 if start is not None:
222 rv.append( (start, self.__codepoints[ri]-1) )
223 start = None
224 else:
225 start = self.__codepoints[ri]
226 if start is not None:
227 rv.append( (start, self.MaxCodePoint) )
228 return rv
229
239
246
247 from unicode_data import *
248
249
250
251
252
253
254
255
# Code points allowed as the first character of an XML name; the values
# match the NameStartChar production of XML 1.0 (Fifth Edition).
# Bare integers are single code points and tuples are (first, last)
# pairs -- assumed to be interpreted the same way as the arguments of
# CodePointSet.add, i.e. inclusive at both ends.
_NameStartChar = CodePointSet(
    0x3A,                   # ':'
    (0x41, 0x5A),           # 'A' - 'Z'
    0x5F,                   # '_'
    (0x61, 0x7A),           # 'a' - 'z'
    (0xC0, 0xD6),
    (0xD8, 0xF6),
    (0xF8, 0x2FF),
    (0x370, 0x37D),
    (0x37F, 0x1FFF),
    (0x200C, 0x200D),
    (0x2070, 0x218F),
    (0x2C00, 0x2FEF),
    (0x3001, 0xD7FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFFD),
    (0x10000, 0xEFFFF))
272
273
# Code points allowed anywhere in an XML name: constructed from
# _NameStartChar and then extended with the additions listed in the
# NameChar production of XML 1.0 (Fifth Edition).
_NameChar = CodePointSet(_NameStartChar)
_NameChar.extend([
    0x2D,                   # '-'
    0x2E,                   # '.'
    (0x30, 0x39),           # '0' - '9'
    0xB7,
    (0x0300, 0x036F),
    (0x203F, 0x2040),
])
280
281
# Single-character escapes: maps the character that follows the
# backslash to a set holding the one code point it denotes.
SingleCharEsc = dict(
    n=CodePointSet(0x0A),   # line feed
    r=CodePointSet(0x0D),   # carriage return
    t=CodePointSet(0x09),   # horizontal tab
)
# Each of the remaining escapable characters stands for itself.
for c in ('\\', '|', '.', '-', '^', '?', '*', '+',
          '{', '}', '(', ')', '[', ']'):
    SingleCharEsc[c] = CodePointSet(ord(c))
287
288
# Wildcard: the negate() of the set holding line feed and carriage
# return.  NOTE(review): negate() is defined outside this excerpt; it is
# presumed to yield the complement of the set -- confirm.
WildcardEsc = CodePointSet(0x0A, 0x0D).negate()

# Multi-character escapes, keyed by the letter that follows the
# backslash.  Every upper-case entry is the negate() of its lower-case
# partner, except for the word classes, where 'W' is built first and
# 'w' is derived from it.  Statement order is kept as-is on purpose.
MultiCharEsc = dict()

# Space, tab, line feed, carriage return.
MultiCharEsc['s'] = CodePointSet(0x20, 0x09, 0x0A, 0x0D)
MultiCharEsc['S'] = MultiCharEsc['s'].negate()

# XML name-start characters.
MultiCharEsc['i'] = _NameStartChar
MultiCharEsc['I'] = MultiCharEsc['i'].negate()

# XML name characters.
MultiCharEsc['c'] = _NameChar
MultiCharEsc['C'] = MultiCharEsc['c'].negate()

# The 'Nd' entry of PropertyMap.
MultiCharEsc['d'] = PropertyMap['Nd']
MultiCharEsc['D'] = MultiCharEsc['d'].negate()

# Union of the 'P', 'Z' and 'C' entries of PropertyMap.
MultiCharEsc['W'] = CodePointSet(PropertyMap['P'])
MultiCharEsc['W'].extend(PropertyMap['Z'])
MultiCharEsc['W'].extend(PropertyMap['C'])
MultiCharEsc['w'] = MultiCharEsc['W'].negate()
301