Package pyxb :: Package utils :: Module unicode
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.unicode

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2013, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for Unicode characters as required to 
 17  support the regular expression syntax defined in U{annex F 
 18  <http://www.w3.org/TR/xmlschema-2/#regexs>} 
 19  of the XML Schema definition. 
 20   
 21  In particular, we need to be able to identify character properties and 
 22  block escapes, as defined in F.1.1, by name. 
 23   
 24   - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt} 
 25   - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt} 
 26   - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt} 
 27   
 28  The Unicode database active at the time XML Schema 1.0 was defined is 
 29  archived at 
 30  U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html}, 
 31  and refers to U{Unicode Standard Annex #27: Unicode 3.1 
 32  <http://www.unicode.org/unicode/reports/tr27/>}. 
 33  """ 
 34   
 35  import re 
 36  import logging 
 37  import pyxb.utils.utility 
 38   
 39  _log = logging.getLogger(__name__) 
 40   
 41  SupportsWideUnicode = False 
 42  try: 
 43      re.compile(u'[\U0001d7ce-\U0001d7ff]') 
 44      SupportsWideUnicode = True 
 45  except: 
 46      pass 
 47   
 48  import bisect 
class CodePointSetError (LookupError):
    """Exception signalling that a L{CodePointSet} has been misused."""
@pyxb.utils.utility.BackfillComparisons
class CodePointSet (object):
    """Represent a set of Unicode code points.

    Each code point is an integral value between 0 and 0x10FFFF.  This
    class is used to represent a set of code points in a manner
    suitable for use as regular expression character sets."""

    MaxCodePoint = 0x10FFFF
    """The maximum value for a code point in the Unicode code point
    space.  This is normally 0xFFFF, because wide unicode characters
    are generally not enabled in Python builds.  If, however, they are
    enabled, this will be the full value of 0x10FFFF."""

    MaxShortCodePoint = 0xFFFF
    if not SupportsWideUnicode:
        MaxCodePoint = MaxShortCodePoint

    # The internal representation of the codepoints is as a sorted
    # list where values at an even index denote the first codepoint in
    # a range that is in the set, and the immediately following value
    # indicates the next following codepoint that is not in the set.
    # A missing value at the end is interpreted as MaxCodePoint.  For
    # example, the sequence [ 12, 15, 200 ] denotes the set containing
    # codepoints 12, 13, 14, and everything above 199.
    __codepoints = None

    def _codepoints (self):
        """For testing purposes only, access to the codepoints
        internal representation."""
        return self.__codepoints

    def __hash__ (self):
        """Hash the current contents of the set.

        The internal list is snapshotted into a tuple first, because
        lists are not hashable.  The result is consistent with
        L{__eq__}, which compares the same list.  The hash changes
        whenever the set is mutated, so an instance must not be
        modified while it is used as a dictionary key or set member."""
        return hash(tuple(self.__codepoints))

    def __eq__ (self, other):
        """Equality is delegated to the codepoints list."""
        return self.__codepoints == other.__codepoints

    def __lt__ (self, other):
        """Ordering is delegated to the codepoints list."""
        return self.__codepoints < other.__codepoints

    def __init__ (self, *args):
        """Create a code point set.

        @param args: Either a single L{CodePointSet} to copy, a single
        list of values, or any number of values.  Each value must be
        acceptable to L{add}."""
        self.__codepoints = []
        if 1 == len(args):
            if isinstance(args[0], CodePointSet):
                # Copy constructor: duplicate the other set's boundaries.
                self.__codepoints.extend(args[0].__codepoints)
                return
            if isinstance(args[0], list):
                args = args[0]
        for a in args:
            self.add(a)

    def __mutate (self, value, do_add):
        """Add a code point or range to the set, or remove it.

        @param value: An integral code point, a one-character string,
        or a tuple C{(s,e)} of start and end (inclusive) code points.
        @param do_add: C{True} to add the value, C{False} to remove it.
        @return: C{self}
        @raise TypeError: if C{value} is a string longer than one
        character.
        @raise ValueError: if the start of the range does not precede
        its (exclusive) end."""
        # Identify the start (inclusive) and end (exclusive) code
        # points of the value's range.
        if isinstance(value, tuple):
            (s, e) = value
            e += 1
        elif isinstance(value, basestring):
            if 1 < len(value):
                raise TypeError()
            s = ord(value)
            e = s+1
        else:
            s = int(value)
            e = s+1
        if s >= e:
            raise ValueError('codepoint range value order')

        # Validate the range for the code points supported by this
        # Python interpreter.  Recall that e is exclusive.
        if s > self.MaxCodePoint:
            return self
        if e > self.MaxCodePoint:
            e = self.MaxCodePoint+1

        # Index of first code point equal to or greater than s
        li = bisect.bisect_left(self.__codepoints, s)
        # Index of last code point less than or equal to e
        ri = bisect.bisect_right(self.__codepoints, e)
        # There are four cases; if we're subtracting, they reflect.
        # The parity of each index tells whether the corresponding end
        # of the incoming range lands on the "included" or "excluded"
        # side of the boundary list.
        case = ((li & 1) << 1) | (ri & 1)
        if not do_add:
            case = 3 - case
        if 0x03 == case:
            # Add: Incoming value begins and ends within existing ranges
            del self.__codepoints[li:ri]
        elif 0x02 == case:
            # Add: Incoming value extends into an excluded range
            del self.__codepoints[li+1:ri]
            self.__codepoints[li] = e
        elif 0x01 == case:
            # Add: Incoming value begins in an excluded range
            del self.__codepoints[li+1:ri]
            self.__codepoints[li] = s
        else:
            # Add: Incoming value begins and ends within excluded ranges
            self.__codepoints[li:ri] = [s, e]
        return self

    def add (self, value):
        """Add the given value to the code point set.

        @param value: An integral value denoting a code point, or a
        tuple C{(s,e)} denoting the start and end (inclusive) code
        points in a range.
        @return: C{self}"""
        return self.__mutate(value, True)

    def extend (self, values):
        """Add multiple values to a code point set.

        @param values: Either a L{CodePointSet} instance, or an iterable
        whose members are valid parameters to L{add}.

        @return: C{self}"""
        if isinstance(values, CodePointSet):
            self.extend(values.asTuples())
        else:
            for v in values:
                self.__mutate(v, True)
        return self

    def subtract (self, value):
        """Remove the given value from the code point set.

        @param value: An integral value denoting a code point, or a tuple
        C{(s,e)} denoting the start and end (inclusive) code points in a
        range, or a L{CodePointSet}.

        @return: C{self}"""
        if isinstance(value, CodePointSet):
            for v in value.asTuples():
                self.subtract(v)
            return self
        return self.__mutate(value, False)

    # Escape sequences for characters that must not appear unescaped in
    # Python regular expression patterns.  Maps each bad character to a safe
    # escape sequence.
    __XMLtoPythonREMap = {
        u'\x00': u'\\x00',  # From docs for Python's "re" module: Regular
                            # expression pattern strings may not contain null
                            # bytes
        u'^': u'\\^',       # Indicates negation if it happens to occur at the
                            # start of a character group
        u'\\': u'\\\\',     # Escape character
        u'[': u'\\[',       # Actually doesn't need to be escaped inside a Python
                            # character group, but escaping it is less confusing.
        u']': u'\\]',       # End of character group
        u'-': u'\\-',       # Indicates a range of characters
        }

    # Return the given code point as a unicode character suitable for
    # use in a regular expression
    def __unichr (self, code_point):
        rv = unichr(code_point)
        rv = self.__XMLtoPythonREMap.get(rv, rv)
        return rv

    def asPattern (self, with_brackets=True):
        """Return the code point set as Unicode regular expression
        character group consisting of a sequence of characters or
        character ranges.

        This returns a regular expression fragment using Python's
        regular expression syntax.  Note that different regular expression
        syntaxes are not compatible, often in subtle ways.

        @param with_brackets: If C{True} (default), square brackets
        are added to enclose the returned character group.
        @return: The pattern fragment as a unicode string."""
        rva = []
        if with_brackets:
            rva.append(u'[')
        for (s, e) in self.asTuples():
            if s == e:
                rva.append(self.__unichr(s))
            else:
                rva.extend([self.__unichr(s), u'-', self.__unichr(e)])
        if with_brackets:
            rva.append(u']')
        return u''.join(rva)

    def asTuples (self):
        """Return the codepoints as tuples denoting the ranges that are in
        the set.

        Each tuple C{(s, e)} indicates that the code points from C{s}
        (inclusive) to C{e} (inclusive) are in the set."""

        rv = []
        start = None
        # Boundaries alternate between "first included" and "first
        # excluded"; pair them up, converting the exclusive end into an
        # inclusive one.
        for boundary in self.__codepoints:
            if start is not None:
                rv.append( (start, boundary-1) )
                start = None
            else:
                start = boundary
        # An unpaired final boundary denotes a range that runs to the
        # largest supported code point.
        if (start is not None) and (start <= self.MaxCodePoint):
            rv.append( (start, self.MaxCodePoint) )
        return rv

    def negate (self):
        """Return an instance that represents the inverse of this set."""
        rv = type(self)()
        if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]):
            # This set starts at code point 0: dropping that boundary
            # flips the meaning of every remaining boundary.
            rv.__codepoints.extend(self.__codepoints[1:])
        else:
            # Otherwise prepend a boundary at 0 to achieve the same flip.
            rv.__codepoints.append(0)
            rv.__codepoints.extend(self.__codepoints)
        return rv

    def asSingleCharacter (self):
        """If this set represents a single character, return it as its
        unicode string value.  Otherwise return C{None}."""
        if (2 != len(self.__codepoints)) or (1 < (self.__codepoints[1] - self.__codepoints[0])):
            return None
        return unichr(self.__codepoints[0])
273 274 from pyxb.utils.unicode_data import PropertyMap 275 from pyxb.utils.unicode_data import BlockMap
class XML1p0e2 (object):
    """Regular expression support for XML Schema Data Types.

    This class holds character classes and regular expressions used to
    constrain the lexical space of XML Schema datatypes derived from
    U{string<http://www.w3.org/TR/xmlschema-2/#string>}.  They are
    from U{XML 1.0 (Second
    Edition)<http://www.w3.org/TR/2000/WD-xml-2e-20000814>} and
    U{Namespaces in XML
    <http://www.w3.org/TR/1999/REC-xml-names-19990114/>}.

    Unlike the regular expressions used for pattern constraints in XML
    Schema, which are derived from the Unicode 3.1 specification,
    these are derived from the Unicode 2.0 specification.

    The XML Schema definition refers explicitly to the second edition
    of XML, so we have to use these code point sets and patterns.  Be
    aware that U{subsequent updates to the XML specification
    <http://www.w3.org/XML/xml-V10-4e-errata#E09>} have changed the
    corresponding patterns for other uses of XML.  One significant
    change is that the original specification, used here, does not
    allow wide unicode characters in names; only L{Char} is extended
    beyond 0xFFFF, and only when the interpreter supports it.

    The C{*_pat} attributes are unanchored pattern fragments.  The
    C{*_re} attributes are compiled expressions that match an entire
    string; they are anchored with C{\\Z} rather than C{$} so that a
    value followed by a trailing newline is not accepted."""

    Char = CodePointSet(
        0x0009,
        0x000A,
        0x000D,
        ( 0x0020, 0xD7FF ),
        ( 0xE000, 0xFFFD )
        )
    if SupportsWideUnicode:
        Char.add( ( 1+CodePointSet.MaxShortCodePoint, CodePointSet.MaxCodePoint ) )

    BaseChar = CodePointSet(
        ( 0x0041, 0x005A ),
        ( 0x0061, 0x007A ),
        ( 0x00C0, 0x00D6 ),
        ( 0x00D8, 0x00F6 ),
        ( 0x00F8, 0x00FF ),
        ( 0x0100, 0x0131 ),
        ( 0x0134, 0x013E ),
        ( 0x0141, 0x0148 ),
        ( 0x014A, 0x017E ),
        ( 0x0180, 0x01C3 ),
        ( 0x01CD, 0x01F0 ),
        ( 0x01F4, 0x01F5 ),
        ( 0x01FA, 0x0217 ),
        ( 0x0250, 0x02A8 ),
        ( 0x02BB, 0x02C1 ),
        0x0386,
        ( 0x0388, 0x038A ),
        0x038C,
        ( 0x038E, 0x03A1 ),
        ( 0x03A3, 0x03CE ),
        ( 0x03D0, 0x03D6 ),
        0x03DA,
        0x03DC,
        0x03DE,
        0x03E0,
        ( 0x03E2, 0x03F3 ),
        ( 0x0401, 0x040C ),
        ( 0x040E, 0x044F ),
        ( 0x0451, 0x045C ),
        ( 0x045E, 0x0481 ),
        ( 0x0490, 0x04C4 ),
        ( 0x04C7, 0x04C8 ),
        ( 0x04CB, 0x04CC ),
        ( 0x04D0, 0x04EB ),
        ( 0x04EE, 0x04F5 ),
        ( 0x04F8, 0x04F9 ),
        ( 0x0531, 0x0556 ),
        0x0559,
        ( 0x0561, 0x0586 ),
        ( 0x05D0, 0x05EA ),
        ( 0x05F0, 0x05F2 ),
        ( 0x0621, 0x063A ),
        ( 0x0641, 0x064A ),
        ( 0x0671, 0x06B7 ),
        ( 0x06BA, 0x06BE ),
        ( 0x06C0, 0x06CE ),
        ( 0x06D0, 0x06D3 ),
        0x06D5,
        ( 0x06E5, 0x06E6 ),
        ( 0x0905, 0x0939 ),
        0x093D,
        ( 0x0958, 0x0961 ),
        ( 0x0985, 0x098C ),
        ( 0x098F, 0x0990 ),
        ( 0x0993, 0x09A8 ),
        ( 0x09AA, 0x09B0 ),
        0x09B2,
        ( 0x09B6, 0x09B9 ),
        ( 0x09DC, 0x09DD ),
        ( 0x09DF, 0x09E1 ),
        ( 0x09F0, 0x09F1 ),
        ( 0x0A05, 0x0A0A ),
        ( 0x0A0F, 0x0A10 ),
        ( 0x0A13, 0x0A28 ),
        ( 0x0A2A, 0x0A30 ),
        ( 0x0A32, 0x0A33 ),
        ( 0x0A35, 0x0A36 ),
        ( 0x0A38, 0x0A39 ),
        ( 0x0A59, 0x0A5C ),
        0x0A5E,
        ( 0x0A72, 0x0A74 ),
        ( 0x0A85, 0x0A8B ),
        0x0A8D,
        ( 0x0A8F, 0x0A91 ),
        ( 0x0A93, 0x0AA8 ),
        ( 0x0AAA, 0x0AB0 ),
        ( 0x0AB2, 0x0AB3 ),
        ( 0x0AB5, 0x0AB9 ),
        0x0ABD,
        0x0AE0,
        ( 0x0B05, 0x0B0C ),
        ( 0x0B0F, 0x0B10 ),
        ( 0x0B13, 0x0B28 ),
        ( 0x0B2A, 0x0B30 ),
        ( 0x0B32, 0x0B33 ),
        ( 0x0B36, 0x0B39 ),
        0x0B3D,
        ( 0x0B5C, 0x0B5D ),
        ( 0x0B5F, 0x0B61 ),
        ( 0x0B85, 0x0B8A ),
        ( 0x0B8E, 0x0B90 ),
        ( 0x0B92, 0x0B95 ),
        ( 0x0B99, 0x0B9A ),
        0x0B9C,
        ( 0x0B9E, 0x0B9F ),
        ( 0x0BA3, 0x0BA4 ),
        ( 0x0BA8, 0x0BAA ),
        ( 0x0BAE, 0x0BB5 ),
        ( 0x0BB7, 0x0BB9 ),
        ( 0x0C05, 0x0C0C ),
        ( 0x0C0E, 0x0C10 ),
        ( 0x0C12, 0x0C28 ),
        ( 0x0C2A, 0x0C33 ),
        ( 0x0C35, 0x0C39 ),
        ( 0x0C60, 0x0C61 ),
        ( 0x0C85, 0x0C8C ),
        ( 0x0C8E, 0x0C90 ),
        ( 0x0C92, 0x0CA8 ),
        ( 0x0CAA, 0x0CB3 ),
        ( 0x0CB5, 0x0CB9 ),
        0x0CDE,
        ( 0x0CE0, 0x0CE1 ),
        ( 0x0D05, 0x0D0C ),
        ( 0x0D0E, 0x0D10 ),
        ( 0x0D12, 0x0D28 ),
        ( 0x0D2A, 0x0D39 ),
        ( 0x0D60, 0x0D61 ),
        ( 0x0E01, 0x0E2E ),
        0x0E30,
        ( 0x0E32, 0x0E33 ),
        ( 0x0E40, 0x0E45 ),
        ( 0x0E81, 0x0E82 ),
        0x0E84,
        ( 0x0E87, 0x0E88 ),
        0x0E8A,
        0x0E8D,
        ( 0x0E94, 0x0E97 ),
        ( 0x0E99, 0x0E9F ),
        ( 0x0EA1, 0x0EA3 ),
        0x0EA5,
        0x0EA7,
        ( 0x0EAA, 0x0EAB ),
        ( 0x0EAD, 0x0EAE ),
        0x0EB0,
        ( 0x0EB2, 0x0EB3 ),
        0x0EBD,
        ( 0x0EC0, 0x0EC4 ),
        ( 0x0F40, 0x0F47 ),
        ( 0x0F49, 0x0F69 ),
        ( 0x10A0, 0x10C5 ),
        ( 0x10D0, 0x10F6 ),
        0x1100,
        ( 0x1102, 0x1103 ),
        ( 0x1105, 0x1107 ),
        0x1109,
        ( 0x110B, 0x110C ),
        ( 0x110E, 0x1112 ),
        0x113C,
        0x113E,
        0x1140,
        0x114C,
        0x114E,
        0x1150,
        ( 0x1154, 0x1155 ),
        0x1159,
        ( 0x115F, 0x1161 ),
        0x1163,
        0x1165,
        0x1167,
        0x1169,
        ( 0x116D, 0x116E ),
        ( 0x1172, 0x1173 ),
        0x1175,
        0x119E,
        0x11A8,
        0x11AB,
        ( 0x11AE, 0x11AF ),
        ( 0x11B7, 0x11B8 ),
        0x11BA,
        ( 0x11BC, 0x11C2 ),
        0x11EB,
        0x11F0,
        0x11F9,
        ( 0x1E00, 0x1E9B ),
        ( 0x1EA0, 0x1EF9 ),
        ( 0x1F00, 0x1F15 ),
        ( 0x1F18, 0x1F1D ),
        ( 0x1F20, 0x1F45 ),
        ( 0x1F48, 0x1F4D ),
        ( 0x1F50, 0x1F57 ),
        0x1F59,
        0x1F5B,
        0x1F5D,
        ( 0x1F5F, 0x1F7D ),
        ( 0x1F80, 0x1FB4 ),
        ( 0x1FB6, 0x1FBC ),
        0x1FBE,
        ( 0x1FC2, 0x1FC4 ),
        ( 0x1FC6, 0x1FCC ),
        ( 0x1FD0, 0x1FD3 ),
        ( 0x1FD6, 0x1FDB ),
        ( 0x1FE0, 0x1FEC ),
        ( 0x1FF2, 0x1FF4 ),
        ( 0x1FF6, 0x1FFC ),
        0x2126,
        ( 0x212A, 0x212B ),
        0x212E,
        ( 0x2180, 0x2182 ),
        ( 0x3041, 0x3094 ),
        ( 0x30A1, 0x30FA ),
        ( 0x3105, 0x312C ),
        ( 0xAC00, 0xD7A3 )
        )

    Ideographic = CodePointSet(
        ( 0x4E00, 0x9FA5 ),
        0x3007,
        ( 0x3021, 0x3029 )
        )

    Letter = CodePointSet(BaseChar).extend(Ideographic)

    CombiningChar = CodePointSet(
        ( 0x0300, 0x0345 ),
        ( 0x0360, 0x0361 ),
        ( 0x0483, 0x0486 ),
        ( 0x0591, 0x05A1 ),
        ( 0x05A3, 0x05B9 ),
        ( 0x05BB, 0x05BD ),
        0x05BF,
        ( 0x05C1, 0x05C2 ),
        0x05C4,
        ( 0x064B, 0x0652 ),
        0x0670,
        ( 0x06D6, 0x06DC ),
        ( 0x06DD, 0x06DF ),
        ( 0x06E0, 0x06E4 ),
        ( 0x06E7, 0x06E8 ),
        ( 0x06EA, 0x06ED ),
        ( 0x0901, 0x0903 ),
        0x093C,
        ( 0x093E, 0x094C ),
        0x094D,
        ( 0x0951, 0x0954 ),
        ( 0x0962, 0x0963 ),
        ( 0x0981, 0x0983 ),
        0x09BC,
        0x09BE,
        0x09BF,
        ( 0x09C0, 0x09C4 ),
        ( 0x09C7, 0x09C8 ),
        ( 0x09CB, 0x09CD ),
        0x09D7,
        ( 0x09E2, 0x09E3 ),
        0x0A02,
        0x0A3C,
        0x0A3E,
        0x0A3F,
        ( 0x0A40, 0x0A42 ),
        ( 0x0A47, 0x0A48 ),
        ( 0x0A4B, 0x0A4D ),
        ( 0x0A70, 0x0A71 ),
        ( 0x0A81, 0x0A83 ),
        0x0ABC,
        ( 0x0ABE, 0x0AC5 ),
        ( 0x0AC7, 0x0AC9 ),
        ( 0x0ACB, 0x0ACD ),
        ( 0x0B01, 0x0B03 ),
        0x0B3C,
        ( 0x0B3E, 0x0B43 ),
        ( 0x0B47, 0x0B48 ),
        ( 0x0B4B, 0x0B4D ),
        ( 0x0B56, 0x0B57 ),
        ( 0x0B82, 0x0B83 ),
        ( 0x0BBE, 0x0BC2 ),
        ( 0x0BC6, 0x0BC8 ),
        ( 0x0BCA, 0x0BCD ),
        0x0BD7,
        ( 0x0C01, 0x0C03 ),
        ( 0x0C3E, 0x0C44 ),
        ( 0x0C46, 0x0C48 ),
        ( 0x0C4A, 0x0C4D ),
        ( 0x0C55, 0x0C56 ),
        ( 0x0C82, 0x0C83 ),
        ( 0x0CBE, 0x0CC4 ),
        ( 0x0CC6, 0x0CC8 ),
        ( 0x0CCA, 0x0CCD ),
        ( 0x0CD5, 0x0CD6 ),
        ( 0x0D02, 0x0D03 ),
        ( 0x0D3E, 0x0D43 ),
        ( 0x0D46, 0x0D48 ),
        ( 0x0D4A, 0x0D4D ),
        0x0D57,
        0x0E31,
        ( 0x0E34, 0x0E3A ),
        ( 0x0E47, 0x0E4E ),
        0x0EB1,
        ( 0x0EB4, 0x0EB9 ),
        ( 0x0EBB, 0x0EBC ),
        ( 0x0EC8, 0x0ECD ),
        ( 0x0F18, 0x0F19 ),
        0x0F35,
        0x0F37,
        0x0F39,
        0x0F3E,
        0x0F3F,
        ( 0x0F71, 0x0F84 ),
        ( 0x0F86, 0x0F8B ),
        ( 0x0F90, 0x0F95 ),
        0x0F97,
        ( 0x0F99, 0x0FAD ),
        ( 0x0FB1, 0x0FB7 ),
        0x0FB9,
        ( 0x20D0, 0x20DC ),
        0x20E1,
        ( 0x302A, 0x302F ),
        0x3099,
        0x309A
        )

    Digit = CodePointSet(
        ( 0x0030, 0x0039 ),
        ( 0x0660, 0x0669 ),
        ( 0x06F0, 0x06F9 ),
        ( 0x0966, 0x096F ),
        ( 0x09E6, 0x09EF ),
        ( 0x0A66, 0x0A6F ),
        ( 0x0AE6, 0x0AEF ),
        ( 0x0B66, 0x0B6F ),
        ( 0x0BE7, 0x0BEF ),
        ( 0x0C66, 0x0C6F ),
        ( 0x0CE6, 0x0CEF ),
        ( 0x0D66, 0x0D6F ),
        ( 0x0E50, 0x0E59 ),
        ( 0x0ED0, 0x0ED9 ),
        ( 0x0F20, 0x0F29 )
        )

    Extender = CodePointSet(
        0x00B7,
        0x02D0,
        0x02D1,
        0x0387,
        0x0640,
        0x0E46,
        0x0EC6,
        0x3005,
        ( 0x3031, 0x3035 ),
        ( 0x309D, 0x309E ),
        ( 0x30FC, 0x30FE )
        )

    # Not an explicit production, but used in Name production
    NameStartChar = CodePointSet(Letter)
    NameStartChar.add(ord('_'))
    NameStartChar.add(ord(':'))

    # Same as NameStartChar, but without the colon (namespace-aware names)
    NCNameStartChar = CodePointSet(Letter)
    NCNameStartChar.add(ord('_'))

    NameChar = CodePointSet(Letter)
    NameChar.extend(Digit)
    NameChar.add(ord('.'))
    NameChar.add(ord('-'))
    NameChar.add(ord('_'))
    NameChar.add(ord(':'))
    NameChar.extend(CombiningChar)
    NameChar.extend(Extender)

    # Same as NameChar, but without the colon (namespace-aware names)
    NCNameChar = CodePointSet(Letter)
    NCNameChar.extend(Digit)
    NCNameChar.add(ord('.'))
    NCNameChar.add(ord('-'))
    NCNameChar.add(ord('_'))
    NCNameChar.extend(CombiningChar)
    NCNameChar.extend(Extender)

    # The compiled expressions are anchored with \Z instead of $: in
    # Python, $ also matches just before a newline that ends the
    # string, which would accept values such as u'name\n'.
    Name_pat = '%s%s*' % (NameStartChar.asPattern(), NameChar.asPattern())
    Name_re = re.compile(r'^%s\Z' % (Name_pat,))
    NmToken_pat = '%s+' % (NameChar.asPattern(),)
    NmToken_re = re.compile(r'^%s\Z' % (NmToken_pat,))
    NCName_pat = '%s%s*' % (NCNameStartChar.asPattern(), NCNameChar.asPattern())
    NCName_re = re.compile(r'^%s\Z' % (NCName_pat,))
    QName_pat = '(%s:)?%s' % (NCName_pat, NCName_pat)
    QName_re = re.compile(r'^%s\Z' % (QName_pat,))

# Production 24 : Single Character Escapes
# The letter escapes denote control characters; every other escapable
# character denotes itself.
SingleCharEsc = dict(
    n=CodePointSet(0x0A),
    r=CodePointSet(0x0D),
    t=CodePointSet(0x09),
    )
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))

# Production 25 : Category Escapes
# Production 26: Complemented Category Escapes
# Both the plain (p{..}) and the complemented (P{..}) forms are stored in
# catEsc; complEsc is created but left empty.
catEsc = { }
complEsc = { }
for k, v in PropertyMap.iteritems():
    catEsc.update({
        u'p{%s}' % (k,): v,
        u'P{%s}' % (k,): v.negate(),
        })

# Production 36 : IsBlock escapes
# As above, each block contributes a plain and a complemented entry.
IsBlockEsc = { }
for k, v in BlockMap.iteritems():
    IsBlockEsc.update({
        u'p{Is%s}' % (k,): v,
        u'P{Is%s}' % (k,): v.negate(),
        })

# Production 37 : Multi-Character Escapes
# The wildcard is everything except newline and carriage return.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()

# The directly defined escapes come first; each counterpart of opposite
# case is the negation of the corresponding entry.
MultiCharEsc = {
    's': CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r')),
    'i': CodePointSet(XML1p0e2.Letter).add(ord('_')).add(ord(':')),
    'c': CodePointSet(XML1p0e2.NameChar),
    'd': PropertyMap['Nd'],
    'W': CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C']),
    }
MultiCharEsc['S'] = MultiCharEsc['s'].negate()
MultiCharEsc['I'] = MultiCharEsc['i'].negate()
MultiCharEsc['C'] = MultiCharEsc['c'].negate()
MultiCharEsc['D'] = MultiCharEsc['d'].negate()
MultiCharEsc['w'] = MultiCharEsc['W'].negate()