Package pyxb :: Package utils :: Module unicode
[hide private]
[frames] | no frames]

Source Code for Module pyxb.utils.unicode

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2012, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for Unicode characters as required to 
 17  support the regular expression syntax defined in U{annex F 
 18  <http://www.w3.org/TR/xmlschema-2/#regexs>} 
 19  of the XML Schema definition. 
 20   
 21  In particular, we need to be able to identify character properties and 
 22  block escapes, as defined in F.1.1, by name. 
 23   
 24   - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt} 
 25   - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt} 
 26   - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt} 
 27   
 28  The Unicode database active at the time XML Schema 1.0 was defined is 
 29  archived at 
 30  U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html}, 
 31  and refers to U{Unicode Standard Annex #27: Unicode 3.1 
 32  <http://www.unicode.org/unicode/reports/tr27/>}. 
 33  """ 
 34   
 35  import re 
 36  import logging 
 37   
 38  _log = logging.getLogger(__name__) 
 39   
 40  SupportsWideUnicode = False 
 41  try: 
 42      re.compile(u'[\U0001d7ce-\U0001d7ff]') 
 43      SupportsWideUnicode = True 
 44  except: 
 45      pass 
 46   
 47  import bisect 
 48           
49 -class CodePointSetError (LookupError):
50 """Raised when some abuse of a L{CodePointSet} is detected.""" 51 pass
52
53 -class CodePointSet (object):
54 """Represent a set of Unicode code points. 55 56 Each code point is an integral value between 0 and 0x10FFFF. This 57 class is used to represent a set of code points in a manner 58 suitable for use as regular expression character sets.""" 59 60 MaxCodePoint = 0x10FFFF 61 """The maximum value for a code point in the Unicode code point 62 space. This is normally 0xFFFF, because wide unicode characters 63 are generally not enabled in Python builds. If, however, they are 64 enabled, this will be the full value of 0x10FFFF.""" 65 66 MaxShortCodePoint = 0xFFFF 67 if not SupportsWideUnicode: 68 MaxCodePoint = MaxShortCodePoint 69 70 # The internal representation of the codepoints is as a sorted 71 # list where values at an even index denote the first codepoint in 72 # a range that is in the set, and the immediately following value 73 # indicates the next following codepoint that is not in the set. 74 # A missing value at the end is interpreted as MaxCodePoint. For 75 # example, the sequence [ 12, 15, 200 ] denotes the set containing 76 # codepoints 12, 13, 14, and everything above 199. 77 __codepoints = None 78
79 - def _codepoints (self):
80 """For testing purrposes only, access to the codepoints 81 internal representation.""" 82 return self.__codepoints
83
84 - def __cmp__ (self, other):
85 """Equality is delegated to the codepoints list.""" 86 return cmp(self.__codepoints, other.__codepoints)
87
88 - def __init__ (self, *args):
89 self.__codepoints = [] 90 if 1 == len(args): 91 if isinstance(args[0], CodePointSet): 92 self.__codepoints.extend(args[0].__codepoints) 93 return 94 if isinstance(args[0], list): 95 args = args[0] 96 for a in args: 97 self.add(a)
98
99 - def __mutate (self, value, do_add):
100 # Identify the start (inclusive) and end (exclusive) code 101 # points of the value's range. 102 if isinstance(value, tuple): 103 (s, e) = value 104 e += 1 105 elif isinstance(value, basestring): 106 if 1 < len(value): 107 raise TypeError() 108 s = ord(value) 109 e = s+1 110 else: 111 s = int(value) 112 e = s+1 113 if s > e: 114 raise ValueError('codepoint range value order') 115 116 # Validate the range for the code points supported by this 117 # Python interpreter 118 if s > self.MaxCodePoint: 119 return self 120 if e > self.MaxCodePoint: 121 e = self.MaxCodePoint 122 e = min(e, self.MaxCodePoint) 123 124 # Index of first code point equal to or greater than s 125 li = bisect.bisect_left(self.__codepoints, s) 126 # Index of last code point less than or equal to e 127 ri = bisect.bisect_right(self.__codepoints, e) 128 # There are four cases; if we're subtracting, they reflect. 129 case = ((li & 1) << 1) | (ri & 1) 130 if not do_add: 131 case = 3 - case 132 if 0x03 == case: 133 # Add: Incoming value begins and ends within existing ranges 134 del self.__codepoints[li:ri] 135 elif 0x02 == case: 136 # Add: Incoming value extends into an excluded range 137 del self.__codepoints[li+1:ri] 138 self.__codepoints[li] = e 139 elif 0x01 == case: 140 # Add: Incoming value begins in an excluded range 141 del self.__codepoints[li+1:ri] 142 self.__codepoints[li] = s 143 else: 144 # Add: Incoming value begins and ends within excluded ranges 145 self.__codepoints[li:ri] = [s, e] 146 return self
147
148 - def add (self, value):
149 """Add the given value to the code point set. 150 151 @param value: An integral value denoting a code point, or a 152 tuple C{(s,e)} denoting the start and end (inclusive) code 153 points in a range. 154 @return: C{self}""" 155 return self.__mutate(value, True)
156
157 - def extend (self, values):
158 """Add multiple values to a code point set. 159 160 @param values: Either a L{CodePointSet} instance, or an iterable 161 whose members are valid parameters to L{add}. 162 163 @return: C{self}""" 164 if isinstance(values, CodePointSet): 165 self.extend(values.asTuples()) 166 else: 167 for v in values: 168 self.__mutate(v, True) 169 return self
170
171 - def subtract (self, value):
172 """Remove the given value from the code point set. 173 174 @param value: An integral value denoting a code point, or a tuple 175 C{(s,e)} denoting the start and end (inclusive) code points in a 176 range, or a L{CodePointSet}. 177 178 @return: C{self}""" 179 if isinstance(value, CodePointSet): 180 for v in value.asTuples(): 181 self.subtract(v) 182 return self 183 return self.__mutate(value, False)
184 185 # Escape sequences for characters that must not appear unescaped in 186 # Python regular expression patterns. Maps each bad character to a safe 187 # escape sequence. 188 __XMLtoPythonREMap = { 189 u'\x00': u'\\x00', # From docs for Python's "re" module: Regular 190 # expression pattern strings may not contain null 191 # bytes 192 u'^': u'\\^', # Indicates negation if it happens to occur at the 193 # start of a character group 194 u'\\': u'\\\\', # Escape character 195 u'[': u'\\[', # Actually doesn't need to be escaped inside a Python 196 # character group, but escaping it is less confusing. 197 u']': u'\\]', # End of character group 198 u'-': u'\\-', # Indicates a range of characters 199 } 200 201 # Return the given code point as a unicode character suitable for 202 # use in a regular expression
203 - def __unichr (self, code_point):
204 rv = unichr(code_point) 205 rv = self.__XMLtoPythonREMap.get(rv, rv) 206 return rv
207
208 - def asPattern (self, with_brackets=True):
209 """Return the code point set as Unicode regular expression 210 character group consisting of a sequence of characters or 211 character ranges. 212 213 This returns a regular expression fragment using Python's 214 regular expression syntax. Note that different regular expression 215 syntaxes are not compatible, often in subtle ways. 216 217 @param with_brackets: If C{True} (default), square brackets 218 are added to enclose the returned character group.""" 219 rva = [] 220 if with_brackets: 221 rva.append(u'[') 222 for (s, e) in self.asTuples(): 223 if s == e: 224 rva.append(self.__unichr(s)) 225 else: 226 rva.extend([self.__unichr(s), '-', self.__unichr(e)]) 227 if with_brackets: 228 rva.append(u']') 229 return u''.join(rva)
230
231 - def asTuples (self):
232 """Return the codepoints as tuples denoting the ranges that are in 233 the set. 234 235 Each tuple C{(s, e)} indicates that the code points from C{s} 236 (inclusive) to C{e}) (inclusive) are in the set.""" 237 238 rv = [] 239 start = None 240 for ri in xrange(len(self.__codepoints)): 241 if start is not None: 242 rv.append( (start, self.__codepoints[ri]-1) ) 243 start = None 244 else: 245 start = self.__codepoints[ri] 246 if start is not None: 247 rv.append( (start, self.MaxCodePoint) ) 248 return rv
249
250 - def negate (self):
251 """Return an instance that represents the inverse of this set.""" 252 rv = type(self)() 253 if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]): 254 rv.__codepoints.extend(self.__codepoints[1:]) 255 else: 256 rv.__codepoints.append(0) 257 rv.__codepoints.extend(self.__codepoints) 258 return rv
259
260 - def asSingleCharacter (self):
261 """If this set represents a single character, return it as its 262 unicode string value. Otherwise return C{None}.""" 263 if (2 != len(self.__codepoints)) or (1 < (self.__codepoints[1] - self.__codepoints[0])): 264 return None 265 return unichr(self.__codepoints[0])
266 267 from pyxb.utils.unicode_data import PropertyMap 268 from pyxb.utils.unicode_data import BlockMap 269
class XML1p0e2 (object):
    """Regular expression support for XML Schema Data Types.

    This class holds character classes and regular expressions used to
    constrain the lexical space of XML Schema datatypes derived from
    U{string<http://www.w3.org/TR/xmlschema-2/#string>}.  They are
    from U{XML 1.0 (Second
    Edition)<http://www.w3.org/TR/2000/WD-xml-2e-20000814>} and
    U{Namespaces in XML
    <http://www.w3.org/TR/1999/REC-xml-names-19990114/>}.

    Unlike the regular expressions used for pattern constraints in XML
    Schema, which are derived from the Unicode 3.1 specification,
    these are derived from the Unicode 2.0 specification.

    The XML Schema definition refers explicitly to the second edition
    of XML, so we have to use these code point sets and patterns.  Be
    aware that U{subsequent updates to the XML specification
    <http://www.w3.org/XML/xml-V10-4e-errata#E09>} have changed the
    corresponding patterns for other uses of XML.  One significant
    change is that the original specification, used here, does not
    allow wide unicode characters."""

    Char = CodePointSet(
        0x0009, 0x000A, 0x000D,
        (0x0020, 0xD7FF),
        (0xE000, 0xFFFD)
        )
    if SupportsWideUnicode:
        # add(), not extend(): extend() would iterate the tuple and add
        # only the two endpoint code points instead of the whole range.
        Char.add((0x10000, 0x10FFFF))

    BaseChar = CodePointSet(
        (0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x00D6), (0x00D8, 0x00F6),
        (0x00F8, 0x00FF), (0x0100, 0x0131), (0x0134, 0x013E), (0x0141, 0x0148),
        (0x014A, 0x017E), (0x0180, 0x01C3), (0x01CD, 0x01F0), (0x01F4, 0x01F5),
        (0x01FA, 0x0217), (0x0250, 0x02A8), (0x02BB, 0x02C1),
        0x0386,
        (0x0388, 0x038A),
        0x038C,
        (0x038E, 0x03A1), (0x03A3, 0x03CE), (0x03D0, 0x03D6),
        0x03DA, 0x03DC, 0x03DE, 0x03E0,
        (0x03E2, 0x03F3), (0x0401, 0x040C), (0x040E, 0x044F), (0x0451, 0x045C),
        (0x045E, 0x0481), (0x0490, 0x04C4), (0x04C7, 0x04C8), (0x04CB, 0x04CC),
        (0x04D0, 0x04EB), (0x04EE, 0x04F5), (0x04F8, 0x04F9), (0x0531, 0x0556),
        0x0559,
        (0x0561, 0x0586), (0x05D0, 0x05EA), (0x05F0, 0x05F2), (0x0621, 0x063A),
        (0x0641, 0x064A), (0x0671, 0x06B7), (0x06BA, 0x06BE), (0x06C0, 0x06CE),
        (0x06D0, 0x06D3),
        0x06D5,
        (0x06E5, 0x06E6), (0x0905, 0x0939),
        0x093D,
        (0x0958, 0x0961), (0x0985, 0x098C), (0x098F, 0x0990), (0x0993, 0x09A8),
        (0x09AA, 0x09B0),
        0x09B2,
        (0x09B6, 0x09B9), (0x09DC, 0x09DD), (0x09DF, 0x09E1), (0x09F0, 0x09F1),
        (0x0A05, 0x0A0A), (0x0A0F, 0x0A10), (0x0A13, 0x0A28), (0x0A2A, 0x0A30),
        (0x0A32, 0x0A33), (0x0A35, 0x0A36), (0x0A38, 0x0A39), (0x0A59, 0x0A5C),
        0x0A5E,
        (0x0A72, 0x0A74), (0x0A85, 0x0A8B),
        0x0A8D,
        (0x0A8F, 0x0A91), (0x0A93, 0x0AA8), (0x0AAA, 0x0AB0), (0x0AB2, 0x0AB3),
        (0x0AB5, 0x0AB9),
        0x0ABD, 0x0AE0,
        (0x0B05, 0x0B0C), (0x0B0F, 0x0B10), (0x0B13, 0x0B28), (0x0B2A, 0x0B30),
        (0x0B32, 0x0B33), (0x0B36, 0x0B39),
        0x0B3D,
        (0x0B5C, 0x0B5D), (0x0B5F, 0x0B61), (0x0B85, 0x0B8A), (0x0B8E, 0x0B90),
        (0x0B92, 0x0B95), (0x0B99, 0x0B9A),
        0x0B9C,
        (0x0B9E, 0x0B9F), (0x0BA3, 0x0BA4), (0x0BA8, 0x0BAA), (0x0BAE, 0x0BB5),
        (0x0BB7, 0x0BB9), (0x0C05, 0x0C0C), (0x0C0E, 0x0C10), (0x0C12, 0x0C28),
        (0x0C2A, 0x0C33), (0x0C35, 0x0C39), (0x0C60, 0x0C61), (0x0C85, 0x0C8C),
        (0x0C8E, 0x0C90), (0x0C92, 0x0CA8), (0x0CAA, 0x0CB3), (0x0CB5, 0x0CB9),
        0x0CDE,
        (0x0CE0, 0x0CE1), (0x0D05, 0x0D0C), (0x0D0E, 0x0D10), (0x0D12, 0x0D28),
        (0x0D2A, 0x0D39), (0x0D60, 0x0D61), (0x0E01, 0x0E2E),
        0x0E30,
        (0x0E32, 0x0E33), (0x0E40, 0x0E45), (0x0E81, 0x0E82),
        0x0E84,
        (0x0E87, 0x0E88),
        0x0E8A, 0x0E8D,
        (0x0E94, 0x0E97), (0x0E99, 0x0E9F), (0x0EA1, 0x0EA3),
        0x0EA5, 0x0EA7,
        (0x0EAA, 0x0EAB), (0x0EAD, 0x0EAE),
        0x0EB0,
        (0x0EB2, 0x0EB3),
        0x0EBD,
        (0x0EC0, 0x0EC4), (0x0F40, 0x0F47), (0x0F49, 0x0F69), (0x10A0, 0x10C5),
        (0x10D0, 0x10F6),
        0x1100,
        (0x1102, 0x1103), (0x1105, 0x1107),
        0x1109,
        (0x110B, 0x110C), (0x110E, 0x1112),
        0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150,
        (0x1154, 0x1155),
        0x1159,
        (0x115F, 0x1161),
        0x1163, 0x1165, 0x1167, 0x1169,
        (0x116D, 0x116E), (0x1172, 0x1173),
        0x1175, 0x119E, 0x11A8, 0x11AB,
        (0x11AE, 0x11AF), (0x11B7, 0x11B8),
        0x11BA,
        (0x11BC, 0x11C2),
        0x11EB, 0x11F0, 0x11F9,
        (0x1E00, 0x1E9B), (0x1EA0, 0x1EF9), (0x1F00, 0x1F15), (0x1F18, 0x1F1D),
        (0x1F20, 0x1F45), (0x1F48, 0x1F4D), (0x1F50, 0x1F57),
        0x1F59, 0x1F5B, 0x1F5D,
        (0x1F5F, 0x1F7D), (0x1F80, 0x1FB4), (0x1FB6, 0x1FBC),
        0x1FBE,
        (0x1FC2, 0x1FC4), (0x1FC6, 0x1FCC), (0x1FD0, 0x1FD3), (0x1FD6, 0x1FDB),
        (0x1FE0, 0x1FEC), (0x1FF2, 0x1FF4), (0x1FF6, 0x1FFC),
        0x2126,
        (0x212A, 0x212B),
        0x212E,
        (0x2180, 0x2182), (0x3041, 0x3094), (0x30A1, 0x30FA), (0x3105, 0x312C),
        (0xAC00, 0xD7A3)
        )

    Ideographic = CodePointSet(
        (0x4E00, 0x9FA5),
        0x3007,
        (0x3021, 0x3029)
        )

    Letter = CodePointSet(BaseChar).extend(Ideographic)

    CombiningChar = CodePointSet(
        (0x0300, 0x0345), (0x0360, 0x0361), (0x0483, 0x0486), (0x0591, 0x05A1),
        (0x05A3, 0x05B9), (0x05BB, 0x05BD),
        0x05BF,
        (0x05C1, 0x05C2),
        0x05C4,
        (0x064B, 0x0652),
        0x0670,
        (0x06D6, 0x06DC), (0x06DD, 0x06DF), (0x06E0, 0x06E4), (0x06E7, 0x06E8),
        (0x06EA, 0x06ED), (0x0901, 0x0903),
        0x093C,
        (0x093E, 0x094C),
        0x094D,
        (0x0951, 0x0954), (0x0962, 0x0963), (0x0981, 0x0983),
        0x09BC, 0x09BE, 0x09BF,
        (0x09C0, 0x09C4), (0x09C7, 0x09C8), (0x09CB, 0x09CD),
        0x09D7,
        (0x09E2, 0x09E3),
        0x0A02, 0x0A3C, 0x0A3E, 0x0A3F,
        (0x0A40, 0x0A42), (0x0A47, 0x0A48), (0x0A4B, 0x0A4D), (0x0A70, 0x0A71),
        (0x0A81, 0x0A83),
        0x0ABC,
        (0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACD), (0x0B01, 0x0B03),
        0x0B3C,
        (0x0B3E, 0x0B43), (0x0B47, 0x0B48), (0x0B4B, 0x0B4D), (0x0B56, 0x0B57),
        (0x0B82, 0x0B83), (0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCD),
        0x0BD7,
        (0x0C01, 0x0C03), (0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4D),
        (0x0C55, 0x0C56), (0x0C82, 0x0C83), (0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8),
        (0x0CCA, 0x0CCD), (0x0CD5, 0x0CD6), (0x0D02, 0x0D03), (0x0D3E, 0x0D43),
        (0x0D46, 0x0D48), (0x0D4A, 0x0D4D),
        0x0D57, 0x0E31,
        (0x0E34, 0x0E3A), (0x0E47, 0x0E4E),
        0x0EB1,
        (0x0EB4, 0x0EB9), (0x0EBB, 0x0EBC), (0x0EC8, 0x0ECD), (0x0F18, 0x0F19),
        0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
        (0x0F71, 0x0F84), (0x0F86, 0x0F8B), (0x0F90, 0x0F95),
        0x0F97,
        (0x0F99, 0x0FAD), (0x0FB1, 0x0FB7),
        0x0FB9,
        (0x20D0, 0x20DC),
        0x20E1,
        (0x302A, 0x302F),
        0x3099, 0x309A
        )

    Digit = CodePointSet(
        (0x0030, 0x0039), (0x0660, 0x0669), (0x06F0, 0x06F9), (0x0966, 0x096F),
        (0x09E6, 0x09EF), (0x0A66, 0x0A6F), (0x0AE6, 0x0AEF), (0x0B66, 0x0B6F),
        (0x0BE7, 0x0BEF), (0x0C66, 0x0C6F), (0x0CE6, 0x0CEF), (0x0D66, 0x0D6F),
        (0x0E50, 0x0E59), (0x0ED0, 0x0ED9), (0x0F20, 0x0F29)
        )

    Extender = CodePointSet(
        0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        (0x3031, 0x3035), (0x309D, 0x309E), (0x30FC, 0x30FE)
        )

    # Not an explicit production, but used in Name production
    NameStartChar = CodePointSet(Letter)
    NameStartChar.add(ord('_'))
    NameStartChar.add(ord(':'))

    # Same as NameStartChar, without the colon.
    NCNameStartChar = CodePointSet(Letter)
    NCNameStartChar.add(ord('_'))

    NameChar = CodePointSet(Letter)
    NameChar.extend(Digit)
    NameChar.add(ord('.'))
    NameChar.add(ord('-'))
    NameChar.add(ord('_'))
    NameChar.add(ord(':'))
    NameChar.extend(CombiningChar)
    NameChar.extend(Extender)

    # Same as NameChar, without the colon.
    NCNameChar = CodePointSet(Letter)
    NCNameChar.extend(Digit)
    NCNameChar.add(ord('.'))
    NCNameChar.add(ord('-'))
    NCNameChar.add(ord('_'))
    NCNameChar.extend(CombiningChar)
    NCNameChar.extend(Extender)

    # The *_pat values are unanchored fragments; the *_re values match a
    # complete string.  They are anchored with \Z rather than $ because $
    # also matches just before a trailing newline, which would accept a
    # value such as "name\n".
    Name_pat = '%s%s*' % (NameStartChar.asPattern(), NameChar.asPattern())
    Name_re = re.compile(r'^%s\Z' % (Name_pat,))
    NmToken_pat = '%s+' % (NameChar.asPattern(),)
    NmToken_re = re.compile(r'^%s\Z' % (NmToken_pat,))
    NCName_pat = '%s%s*' % (NCNameStartChar.asPattern(), NCNameChar.asPattern())
    NCName_re = re.compile(r'^%s\Z' % (NCName_pat,))
    QName_pat = '(%s:)?%s' % (NCName_pat, NCName_pat)
    QName_re = re.compile(r'^%s\Z' % (QName_pat,))
# Production 24 : Single Character Escapes
# Keys are the character that follows the backslash; values are the
# one-member code point set that the escape denotes.
SingleCharEsc = dict(
    n=CodePointSet(0x0A),
    r=CodePointSet(0x0D),
    t=CodePointSet(0x09),
)
# The remaining escapes stand for the escaped character itself.
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))

# Production 25 : Category Escapes
# Production 26: Complemented Category Escapes
# Both the plain (p{..}) and the complemented (P{..}) forms are stored in
# catEsc; complEsc is created here but left empty.
catEsc = {}
complEsc = {}
for k, v in PropertyMap.iteritems():
    catEsc.update({
        u'p{%s}' % (k,): v,
        u'P{%s}' % (k,): v.negate(),
    })

# Production 36 : IsBlock escapes
IsBlockEsc = {}
for k, v in BlockMap.iteritems():
    IsBlockEsc.update({
        u'p{Is%s}' % (k,): v,
        u'P{Is%s}' % (k,): v.negate(),
    })

# Production 37 : Multi-Character Escapes
# The wildcard is every code point except newline and carriage return.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()

# Directly defined sets first; each remaining entry is the negation of
# its other-case partner and is filled in below.
MultiCharEsc = {
    's': CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r')),
    'i': CodePointSet(XML1p0e2.Letter).add(ord('_')).add(ord(':')),
    'c': CodePointSet(XML1p0e2.NameChar),
    'd': PropertyMap['Nd'],
    'W': CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C']),
}
MultiCharEsc['S'] = MultiCharEsc['s'].negate()
MultiCharEsc['I'] = MultiCharEsc['i'].negate()
MultiCharEsc['C'] = MultiCharEsc['c'].negate()
MultiCharEsc['D'] = MultiCharEsc['d'].negate()
MultiCharEsc['w'] = MultiCharEsc['W'].negate()