1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 """This module contains support for Unicode characters as required to
16 support the regular expression syntax defined in U{annex F
17 <http://www.w3.org/TR/xmlschema-2/#regexs>}
18 of the XML Schema definition.
19
20 In particular, we need to be able to identify character properties and
21 block escapes, as defined in F.1.1, by name.
22
23 - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt}
24 - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt}
25 - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt}
26
27 The Unicode database active at the time XML Schema 1.0 was defined is
28 archived at
29 U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html},
30 and refers to U{Unicode Standard Annex #27: Unicode 3.1
31 <http://www.unicode.org/unicode/reports/tr27/>}.
32 """
33
34 import re
35
# Capability probe: can this interpreter's ``re`` module compile a character
# class whose bounds lie above U+FFFF?  The result gates every use of code
# points beyond MaxShortCodePoint in this module.
SupportsWideUnicode = False
try:
    re.compile(u'[\U0001d7ce-\U0001d7ff]')
except Exception:
    # Best-effort probe: any failure to compile simply means "not supported".
    # (Presumably narrow builds reject the range -- TODO confirm.)  Unlike a
    # bare ``except:``, this lets SystemExit/KeyboardInterrupt propagate.
    pass
else:
    SupportsWideUnicode = True
42
43 import bisect
44
46 """Raised when some abuse of a L{CodePointSet} is detected."""
47 pass
48
50 """Represent a set of Unicode code points.
51
52 Each code point is an integral value between 0 and 0x10FFFF. This
53 class is used to represent a set of code points in a manner
54 suitable for use as regular expression character sets."""
55
56 MaxCodePoint = 0x10FFFF
57 """The maximum value for a code point in the Unicode code point
58 space. This is normally 0xFFFF, because wide unicode characters
59 are generally not enabled in Python builds. If, however, they are
60 enabled, this will be the full value of 0x10FFFF."""
61
62 MaxShortCodePoint = 0xFFFF
63 if not SupportsWideUnicode:
64 MaxCodePoint = MaxShortCodePoint
65
66
67
68
69
70
71
72
73 __codepoints = None
74
76 """For testing purposes only, access to the codepoints
77 internal representation."""
78 return self.__codepoints
79
83
93
94
96
97
def __mutate(self, value, do_add):
    """Add or remove one span of code points, in place.

    C{self.__codepoints} is treated as a sorted list of boundaries in
    which even positions start a range (inclusive) and odd positions
    end it (exclusive).

    @param value: A tuple C{(s,e)} giving the first and last (inclusive)
    code points of a range, a single-character string, or anything
    accepted by C{int()} denoting one code point.
    @param do_add: C{True} to add the span to the set, C{False} to
    remove it.
    @return: C{self}
    @raise TypeError: C{value} is a string longer than one character.
    @raise ValueError: C{value} is a tuple whose end precedes its start."""
    # Normalize to a half-open span [s, e).
    if isinstance(value, tuple):
        (s, e) = value
        e += 1
    elif isinstance(value, (str, type(u''))):
        # (str, type(u'')) is (str, unicode) on Python 2, i.e. basestring.
        if 1 < len(value):
            raise TypeError()
        s = ord(value)
        e = s + 1
    else:
        s = int(value)
        e = s + 1
    # e is already exclusive here, so an empty span (s == e) means the
    # caller passed a reversed tuple such as (5, 4); reject it rather
    # than storing a degenerate boundary pair.
    if s >= e:
        raise ValueError('codepoint range value order')

    # Spans entirely above the supported maximum are ignored.
    if s > self.MaxCodePoint:
        return self
    # NOTE(review): e is exclusive, so clamping it to MaxCodePoint (rather
    # than MaxCodePoint + 1) leaves the top code point itself uncovered.
    # Kept as-is because other methods may depend on it -- confirm.
    e = min(e, self.MaxCodePoint)

    # Number of boundaries strictly below s / at or below e.
    li = bisect.bisect_left(self.__codepoints, s)
    ri = bisect.bisect_right(self.__codepoints, e)

    # An odd index means that end of the span falls inside (or touches)
    # an existing range.  Bit 1 describes s, bit 0 describes e.
    case = ((li & 1) << 1) | (ri & 1)
    if not do_add:
        # Removal is addition with the inside/outside roles swapped.
        case = 3 - case

    if 0x03 == case:
        # Both ends are absorbed: drop every boundary in between.
        del self.__codepoints[li:ri]
    elif 0x02 == case:
        # Only the start is absorbed: the boundary at li becomes e.
        del self.__codepoints[li+1:ri]
        self.__codepoints[li] = e
    elif 0x01 == case:
        # Only the end is absorbed: the boundary at li becomes s.
        del self.__codepoints[li+1:ri]
        self.__codepoints[li] = s
    else:
        # Neither end is absorbed: replace what is covered with [s, e).
        self.__codepoints[li:ri] = [s, e]
    return self
144
def add(self, value):
    """Insert a code point, or a range of code points, into the set.

    @param value: An integral value denoting a code point, or a
    tuple C{(s,e)} denoting the start and end (inclusive) code
    points in a range.
    @return: C{self}"""
    # Second argument True selects "add" rather than "remove".
    updated = self.__mutate(value, True)
    return updated
153
def extend(self, values):
    """Add multiple values to a code point set.

    @param values: Either a L{CodePointSet} instance, or an iterable
    whose members are valid parameters to L{add}.

    @return: C{self}"""
    if isinstance(values, CodePointSet):
        # Another set contributes its inclusive (start, end) tuples.
        values = values.asTuples()
    # Plain loop: the calls are made for their side effect only.
    for member in values:
        self.__mutate(member, True)
    return self
166
def subtract(self, value):
    """Remove the given value from the code point set.

    @param value: An integral value denoting a code point, or a tuple
    C{(s,e)} denoting the start and end (inclusive) code points in a
    range, or a L{CodePointSet}.

    @return: C{self}"""
    if isinstance(value, CodePointSet):
        # Remove each inclusive (start, end) range of the other set in turn.
        for span in value.asTuples():
            self.__mutate(span, False)
        return self
    return self.__mutate(value, False)
179
180
181
# Characters that get a backslash prefix when rendered by __unichr.
# '\\' and '^' are included as well as '-', '[' and ']': inside a
# character class an unescaped backslash escapes whatever follows it, and
# a leading '^' negates the class.
__NotXMLChar_set = frozenset(['-', '[', ']', '\\', '^'])

def __unichr(self, code_point):
    """Return the text used for one code point inside a character group.

    @param code_point: Integral code point to render.
    @return: The one-character unicode string for C{code_point},
    preceded by a backslash when the character is in the escape set."""
    try:
        to_char = unichr
    except NameError:
        # Interpreters without unichr() (Python 3) use chr() instead.
        to_char = chr
    rv = to_char(code_point)
    if rv in self.__NotXMLChar_set:
        rv = u'\\' + rv
    return rv
191
def asPattern(self, with_brackets=True):
    """Render the set as the text of a regular expression character
    group: a run of single characters and C{first-last} ranges.

    @param with_brackets: If C{True} (default), square brackets
    are added to enclose the returned character group.
    @return: The character group as a unicode string."""
    pieces = []
    for (first, last) in self.asTuples():
        pieces.append(self.__unichr(first))
        if first != last:
            # A genuine range: emit the separator and the upper bound.
            pieces.append('-')
            pieces.append(self.__unichr(last))
    body = u''.join(pieces)
    if with_brackets:
        return u'[' + body + u']'
    return body
210
def asTuples(self):
    """Return the codepoints as tuples denoting the ranges that are in
    the set.

    Each tuple C{(s, e)} indicates that the code points from C{s}
    (inclusive) to C{e} (inclusive) are in the set.

    @return: A list of C{(s, e)} tuples in the order the boundaries are
    stored."""
    bounds = self.__codepoints
    # Boundaries come in (start, exclusive end) pairs; convert each pair
    # to an inclusive range.
    rv = [(bounds[i], bounds[i + 1] - 1)
          for i in range(0, len(bounds) - 1, 2)]
    if len(bounds) % 2:
        # A trailing start without an end runs to the top of the range.
        rv.append((bounds[-1], self.MaxCodePoint))
    return rv
229
239
246
247 from unicode_data import *
248
250 """Regular expression support for XML Schema Data Types.
251
252 This class holds character classes and regular expressions used to
253 constrain the lexical space of XML Schema datatypes derived from
254 U{string<http://www.w3.org/TR/xmlschema-2/#string>}. They are
255 from U{XML 1.0 (Second
256 Edition)<http://www.w3.org/TR/2000/WD-xml-2e-20000814>} and
257 U{Namespaces in XML
258 <http://www.w3.org/TR/1999/REC-xml-names-19990114/>}.
259
260 Unlike the regular expressions used for pattern constraints in XML
261 Schema, which are derived from the Unicode 3.1 specification,
262 these are derived from the Unicode 2.0 specification.
263
264 The XML Schema definition refers explicitly to the second edition
265 of XML, so we have to use these code point sets and patterns. Be
266 aware that U{subsequent updates to the XML specification
267 <http://www.w3.org/XML/xml-V10-4e-errata#E09>} have changed the
268 corresponding patterns for other uses of XML. One significant
269 change is that the original specification, used here, does not
270 allow wide unicode characters."""
271
# Code points making up the Char set: tab, line feed, carriage return,
# and two ranges below 0x10000 (0xD800-0xDFFF, 0xFFFE and 0xFFFF are left out).
Char = CodePointSet(
    0x0009,
    0x000A,
    0x000D,
    ( 0x0020, 0xD7FF ),
    ( 0xE000, 0xFFFD )
    )
if SupportsWideUnicode:
    # add() takes the tuple as ONE inclusive range.  extend() would iterate
    # the tuple and add only the two code points 0x10000 and 0x10FFFF.
    Char.add( ( 0x10000, 0x10FFFF ) )
281
282 BaseChar = CodePointSet(
283 ( 0x0041, 0x005A ),
284 ( 0x0061, 0x007A ),
285 ( 0x00C0, 0x00D6 ),
286 ( 0x00D8, 0x00F6 ),
287 ( 0x00F8, 0x00FF ),
288 ( 0x0100, 0x0131 ),
289 ( 0x0134, 0x013E ),
290 ( 0x0141, 0x0148 ),
291 ( 0x014A, 0x017E ),
292 ( 0x0180, 0x01C3 ),
293 ( 0x01CD, 0x01F0 ),
294 ( 0x01F4, 0x01F5 ),
295 ( 0x01FA, 0x0217 ),
296 ( 0x0250, 0x02A8 ),
297 ( 0x02BB, 0x02C1 ),
298 0x0386,
299 ( 0x0388, 0x038A ),
300 0x038C,
301 ( 0x038E, 0x03A1 ),
302 ( 0x03A3, 0x03CE ),
303 ( 0x03D0, 0x03D6 ),
304 0x03DA,
305 0x03DC,
306 0x03DE,
307 0x03E0,
308 ( 0x03E2, 0x03F3 ),
309 ( 0x0401, 0x040C ),
310 ( 0x040E, 0x044F ),
311 ( 0x0451, 0x045C ),
312 ( 0x045E, 0x0481 ),
313 ( 0x0490, 0x04C4 ),
314 ( 0x04C7, 0x04C8 ),
315 ( 0x04CB, 0x04CC ),
316 ( 0x04D0, 0x04EB ),
317 ( 0x04EE, 0x04F5 ),
318 ( 0x04F8, 0x04F9 ),
319 ( 0x0531, 0x0556 ),
320 0x0559,
321 ( 0x0561, 0x0586 ),
322 ( 0x05D0, 0x05EA ),
323 ( 0x05F0, 0x05F2 ),
324 ( 0x0621, 0x063A ),
325 ( 0x0641, 0x064A ),
326 ( 0x0671, 0x06B7 ),
327 ( 0x06BA, 0x06BE ),
328 ( 0x06C0, 0x06CE ),
329 ( 0x06D0, 0x06D3 ),
330 0x06D5,
331 ( 0x06E5, 0x06E6 ),
332 ( 0x0905, 0x0939 ),
333 0x093D,
334 ( 0x0958, 0x0961 ),
335 ( 0x0985, 0x098C ),
336 ( 0x098F, 0x0990 ),
337 ( 0x0993, 0x09A8 ),
338 ( 0x09AA, 0x09B0 ),
339 0x09B2,
340 ( 0x09B6, 0x09B9 ),
341 ( 0x09DC, 0x09DD ),
342 ( 0x09DF, 0x09E1 ),
343 ( 0x09F0, 0x09F1 ),
344 ( 0x0A05, 0x0A0A ),
345 ( 0x0A0F, 0x0A10 ),
346 ( 0x0A13, 0x0A28 ),
347 ( 0x0A2A, 0x0A30 ),
348 ( 0x0A32, 0x0A33 ),
349 ( 0x0A35, 0x0A36 ),
350 ( 0x0A38, 0x0A39 ),
351 ( 0x0A59, 0x0A5C ),
352 0x0A5E,
353 ( 0x0A72, 0x0A74 ),
354 ( 0x0A85, 0x0A8B ),
355 0x0A8D,
356 ( 0x0A8F, 0x0A91 ),
357 ( 0x0A93, 0x0AA8 ),
358 ( 0x0AAA, 0x0AB0 ),
359 ( 0x0AB2, 0x0AB3 ),
360 ( 0x0AB5, 0x0AB9 ),
361 0x0ABD,
362 0x0AE0,
363 ( 0x0B05, 0x0B0C ),
364 ( 0x0B0F, 0x0B10 ),
365 ( 0x0B13, 0x0B28 ),
366 ( 0x0B2A, 0x0B30 ),
367 ( 0x0B32, 0x0B33 ),
368 ( 0x0B36, 0x0B39 ),
369 0x0B3D,
370 ( 0x0B5C, 0x0B5D ),
371 ( 0x0B5F, 0x0B61 ),
372 ( 0x0B85, 0x0B8A ),
373 ( 0x0B8E, 0x0B90 ),
374 ( 0x0B92, 0x0B95 ),
375 ( 0x0B99, 0x0B9A ),
376 0x0B9C,
377 ( 0x0B9E, 0x0B9F ),
378 ( 0x0BA3, 0x0BA4 ),
379 ( 0x0BA8, 0x0BAA ),
380 ( 0x0BAE, 0x0BB5 ),
381 ( 0x0BB7, 0x0BB9 ),
382 ( 0x0C05, 0x0C0C ),
383 ( 0x0C0E, 0x0C10 ),
384 ( 0x0C12, 0x0C28 ),
385 ( 0x0C2A, 0x0C33 ),
386 ( 0x0C35, 0x0C39 ),
387 ( 0x0C60, 0x0C61 ),
388 ( 0x0C85, 0x0C8C ),
389 ( 0x0C8E, 0x0C90 ),
390 ( 0x0C92, 0x0CA8 ),
391 ( 0x0CAA, 0x0CB3 ),
392 ( 0x0CB5, 0x0CB9 ),
393 0x0CDE,
394 ( 0x0CE0, 0x0CE1 ),
395 ( 0x0D05, 0x0D0C ),
396 ( 0x0D0E, 0x0D10 ),
397 ( 0x0D12, 0x0D28 ),
398 ( 0x0D2A, 0x0D39 ),
399 ( 0x0D60, 0x0D61 ),
400 ( 0x0E01, 0x0E2E ),
401 0x0E30,
402 ( 0x0E32, 0x0E33 ),
403 ( 0x0E40, 0x0E45 ),
404 ( 0x0E81, 0x0E82 ),
405 0x0E84,
406 ( 0x0E87, 0x0E88 ),
407 0x0E8A,
408 0x0E8D,
409 ( 0x0E94, 0x0E97 ),
410 ( 0x0E99, 0x0E9F ),
411 ( 0x0EA1, 0x0EA3 ),
412 0x0EA5,
413 0x0EA7,
414 ( 0x0EAA, 0x0EAB ),
415 ( 0x0EAD, 0x0EAE ),
416 0x0EB0,
417 ( 0x0EB2, 0x0EB3 ),
418 0x0EBD,
419 ( 0x0EC0, 0x0EC4 ),
420 ( 0x0F40, 0x0F47 ),
421 ( 0x0F49, 0x0F69 ),
422 ( 0x10A0, 0x10C5 ),
423 ( 0x10D0, 0x10F6 ),
424 0x1100,
425 ( 0x1102, 0x1103 ),
426 ( 0x1105, 0x1107 ),
427 0x1109,
428 ( 0x110B, 0x110C ),
429 ( 0x110E, 0x1112 ),
430 0x113C,
431 0x113E,
432 0x1140,
433 0x114C,
434 0x114E,
435 0x1150,
436 ( 0x1154, 0x1155 ),
437 0x1159,
438 ( 0x115F, 0x1161 ),
439 0x1163,
440 0x1165,
441 0x1167,
442 0x1169,
443 ( 0x116D, 0x116E ),
444 ( 0x1172, 0x1173 ),
445 0x1175,
446 0x119E,
447 0x11A8,
448 0x11AB,
449 ( 0x11AE, 0x11AF ),
450 ( 0x11B7, 0x11B8 ),
451 0x11BA,
452 ( 0x11BC, 0x11C2 ),
453 0x11EB,
454 0x11F0,
455 0x11F9,
456 ( 0x1E00, 0x1E9B ),
457 ( 0x1EA0, 0x1EF9 ),
458 ( 0x1F00, 0x1F15 ),
459 ( 0x1F18, 0x1F1D ),
460 ( 0x1F20, 0x1F45 ),
461 ( 0x1F48, 0x1F4D ),
462 ( 0x1F50, 0x1F57 ),
463 0x1F59,
464 0x1F5B,
465 0x1F5D,
466 ( 0x1F5F, 0x1F7D ),
467 ( 0x1F80, 0x1FB4 ),
468 ( 0x1FB6, 0x1FBC ),
469 0x1FBE,
470 ( 0x1FC2, 0x1FC4 ),
471 ( 0x1FC6, 0x1FCC ),
472 ( 0x1FD0, 0x1FD3 ),
473 ( 0x1FD6, 0x1FDB ),
474 ( 0x1FE0, 0x1FEC ),
475 ( 0x1FF2, 0x1FF4 ),
476 ( 0x1FF6, 0x1FFC ),
477 0x2126,
478 ( 0x212A, 0x212B ),
479 0x212E,
480 ( 0x2180, 0x2182 ),
481 ( 0x3041, 0x3094 ),
482 ( 0x30A1, 0x30FA ),
483 ( 0x3105, 0x312C ),
484 ( 0xAC00, 0xD7A3 )
485 )
486
487 Ideographic = CodePointSet(
488 ( 0x4E00, 0x9FA5 ),
489 0x3007,
490 ( 0x3021, 0x3029 )
491 )
492
# Letter is a copy of BaseChar extended with Ideographic.
Letter = CodePointSet(BaseChar).extend(Ideographic)
494
495 CombiningChar = CodePointSet(
496 ( 0x0300, 0x0345 ),
497 ( 0x0360, 0x0361 ),
498 ( 0x0483, 0x0486 ),
499 ( 0x0591, 0x05A1 ),
500 ( 0x05A3, 0x05B9 ),
501 ( 0x05BB, 0x05BD ),
502 0x05BF,
503 ( 0x05C1, 0x05C2 ),
504 0x05C4,
505 ( 0x064B, 0x0652 ),
506 0x0670,
507 ( 0x06D6, 0x06DC ),
508 ( 0x06DD, 0x06DF ),
509 ( 0x06E0, 0x06E4 ),
510 ( 0x06E7, 0x06E8 ),
511 ( 0x06EA, 0x06ED ),
512 ( 0x0901, 0x0903 ),
513 0x093C,
514 ( 0x093E, 0x094C ),
515 0x094D,
516 ( 0x0951, 0x0954 ),
517 ( 0x0962, 0x0963 ),
518 ( 0x0981, 0x0983 ),
519 0x09BC,
520 0x09BE,
521 0x09BF,
522 ( 0x09C0, 0x09C4 ),
523 ( 0x09C7, 0x09C8 ),
524 ( 0x09CB, 0x09CD ),
525 0x09D7,
526 ( 0x09E2, 0x09E3 ),
527 0x0A02,
528 0x0A3C,
529 0x0A3E,
530 0x0A3F,
531 ( 0x0A40, 0x0A42 ),
532 ( 0x0A47, 0x0A48 ),
533 ( 0x0A4B, 0x0A4D ),
534 ( 0x0A70, 0x0A71 ),
535 ( 0x0A81, 0x0A83 ),
536 0x0ABC,
537 ( 0x0ABE, 0x0AC5 ),
538 ( 0x0AC7, 0x0AC9 ),
539 ( 0x0ACB, 0x0ACD ),
540 ( 0x0B01, 0x0B03 ),
541 0x0B3C,
542 ( 0x0B3E, 0x0B43 ),
543 ( 0x0B47, 0x0B48 ),
544 ( 0x0B4B, 0x0B4D ),
545 ( 0x0B56, 0x0B57 ),
546 ( 0x0B82, 0x0B83 ),
547 ( 0x0BBE, 0x0BC2 ),
548 ( 0x0BC6, 0x0BC8 ),
549 ( 0x0BCA, 0x0BCD ),
550 0x0BD7,
551 ( 0x0C01, 0x0C03 ),
552 ( 0x0C3E, 0x0C44 ),
553 ( 0x0C46, 0x0C48 ),
554 ( 0x0C4A, 0x0C4D ),
555 ( 0x0C55, 0x0C56 ),
556 ( 0x0C82, 0x0C83 ),
557 ( 0x0CBE, 0x0CC4 ),
558 ( 0x0CC6, 0x0CC8 ),
559 ( 0x0CCA, 0x0CCD ),
560 ( 0x0CD5, 0x0CD6 ),
561 ( 0x0D02, 0x0D03 ),
562 ( 0x0D3E, 0x0D43 ),
563 ( 0x0D46, 0x0D48 ),
564 ( 0x0D4A, 0x0D4D ),
565 0x0D57,
566 0x0E31,
567 ( 0x0E34, 0x0E3A ),
568 ( 0x0E47, 0x0E4E ),
569 0x0EB1,
570 ( 0x0EB4, 0x0EB9 ),
571 ( 0x0EBB, 0x0EBC ),
572 ( 0x0EC8, 0x0ECD ),
573 ( 0x0F18, 0x0F19 ),
574 0x0F35,
575 0x0F37,
576 0x0F39,
577 0x0F3E,
578 0x0F3F,
579 ( 0x0F71, 0x0F84 ),
580 ( 0x0F86, 0x0F8B ),
581 ( 0x0F90, 0x0F95 ),
582 0x0F97,
583 ( 0x0F99, 0x0FAD ),
584 ( 0x0FB1, 0x0FB7 ),
585 0x0FB9,
586 ( 0x20D0, 0x20DC ),
587 0x20E1,
588 ( 0x302A, 0x302F ),
589 0x3099,
590 0x309A
591 )
592
593 Digit = CodePointSet(
594 ( 0x0030, 0x0039 ),
595 ( 0x0660, 0x0669 ),
596 ( 0x06F0, 0x06F9 ),
597 ( 0x0966, 0x096F ),
598 ( 0x09E6, 0x09EF ),
599 ( 0x0A66, 0x0A6F ),
600 ( 0x0AE6, 0x0AEF ),
601 ( 0x0B66, 0x0B6F ),
602 ( 0x0BE7, 0x0BEF ),
603 ( 0x0C66, 0x0C6F ),
604 ( 0x0CE6, 0x0CEF ),
605 ( 0x0D66, 0x0D6F ),
606 ( 0x0E50, 0x0E59 ),
607 ( 0x0ED0, 0x0ED9 ),
608 ( 0x0F20, 0x0F29 )
609 )
610
611 Extender = CodePointSet(
612 0x00B7,
613 0x02D0,
614 0x02D1,
615 0x0387,
616 0x0640,
617 0x0E46,
618 0x0EC6,
619 0x3005,
620 ( 0x3031, 0x3035 ),
621 ( 0x309D, 0x309E ),
622 ( 0x30FC, 0x30FE )
623 )
624
625
# Characters that may begin a Name: any Letter, '_' or ':'.
NameStartChar = CodePointSet(Letter).add(ord('_')).add(ord(':'))

# As NameStartChar, but without the colon.
NCNameStartChar = CodePointSet(Letter).add(ord('_'))

# Characters that may appear anywhere in a Name.
NameChar = CodePointSet(Letter).extend(Digit)
NameChar.add(ord('.')).add(ord('-')).add(ord('_')).add(ord(':'))
NameChar.extend(CombiningChar).extend(Extender)

# As NameChar, but without the colon.
NCNameChar = CodePointSet(Letter).extend(Digit)
NCNameChar.add(ord('.')).add(ord('-')).add(ord('_'))
NCNameChar.extend(CombiningChar).extend(Extender)
649
# Pattern text (*_pat) and compiled whole-string matcher (*_re) for each
# kind of name.  The matchers end in \Z rather than $, because $ also
# matches just before a trailing newline and would accept e.g. "abc\n".
Name_pat = '%s%s*' % (NameStartChar.asPattern(), NameChar.asPattern())
Name_re = re.compile('^%s\\Z' % (Name_pat,))
NmToken_pat = '%s+' % (NameChar.asPattern(),)
NmToken_re = re.compile('^%s\\Z' % (NmToken_pat,))
NCName_pat = '%s%s*' % (NCNameStartChar.asPattern(), NCNameChar.asPattern())
NCName_re = re.compile('^%s\\Z' % (NCName_pat,))
# A QName is an NCName optionally preceded by an NCName prefix and a colon.
QName_pat = '(%s:)?%s' % (NCName_pat, NCName_pat)
QName_re = re.compile('^%s\\Z' % (QName_pat,))
658
659
# Maps the character following a backslash to the one-member code point
# set it denotes.  'n', 'r' and 't' name control characters; every
# character in the loop below stands for itself.
SingleCharEsc = dict(
    n=CodePointSet(0x0A),
    r=CodePointSet(0x0D),
    t=CodePointSet(0x09),
    )
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))
665
666
# The wildcard is the negation of the set { line feed, carriage return }.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()

# Multi-character escapes.  First the directly-defined classes...
MultiCharEsc = {
    's': CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r')),
    'i': CodePointSet(XML1p0e2.Letter).add(ord('_')).add(ord(':')),
    'c': CodePointSet(XML1p0e2.NameChar),
    'd': PropertyMap['Nd'],
    'W': CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C']),
    }
# ...then each one's negation under the same letter in the other case.
MultiCharEsc.update({
    'S': MultiCharEsc['s'].negate(),
    'I': MultiCharEsc['i'].negate(),
    'C': MultiCharEsc['c'].negate(),
    'D': MultiCharEsc['d'].negate(),
    'w': MultiCharEsc['W'].negate(),
    })
679