Package pyxb :: Package utils :: Module unicode
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.unicode

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2013, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for Unicode characters as required to 
 17  support the regular expression syntax defined in U{annex F 
 18  <http://www/Documentation/W3C/www.w3.org/TR/xmlschema-2/index.html#regexs>} 
 19  of the XML Schema definition. 
 20   
 21  In particular, we need to be able to identify character properties and 
 22  block escapes, as defined in F.1.1, by name. 
 23   
 24   - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt} 
 25   - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt} 
 26   - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt} 
 27   
 28  The Unicode database active at the time XML Schema 1.0 was defined is 
 29  archived at 
 30  U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html}, 
 31  and refers to U{Unicode Standard Annex #27: Unicode 3.1 
 32  <http://www.unicode.org/unicode/reports/tr27/>}. 
 33  """ 
 34   
 35  import re 
 36  import logging 
 37  import pyxb.utils.utility 
 38  from pyxb.utils import six 
 39  from pyxb.utils.six.moves import xrange 
 40   
 41  _log = logging.getLogger(__name__) 
 42   
 43  SupportsWideUnicode = False 
 44  try: 
 45      re.compile(six.u('[\U0001d7ce-\U0001d7ff]')) 
 46      SupportsWideUnicode = True 
 47  except: 
 48      pass 
 49   
 50  import bisect 
class CodePointSetError (LookupError):
    """Exception signalling that a L{CodePointSet} has been misused."""
@pyxb.utils.utility.BackfillComparisons
class CodePointSet (object):
    """Represent a set of Unicode code points.

    Each code point is an integral value between 0 and 0x10FFFF.  This
    class is used to represent a set of code points in a manner
    suitable for use as regular expression character sets."""

    MaxCodePoint = 0x10FFFF
    """The maximum value for a code point in the Unicode code point
    space.  This is normally 0xFFFF, because wide unicode characters
    are generally not enabled in Python builds.  If, however, they are
    enabled, this will be the full value of 0x10FFFF."""

    MaxShortCodePoint = 0xFFFF
    if not SupportsWideUnicode:
        MaxCodePoint = MaxShortCodePoint

    # The internal representation of the codepoints is as a sorted
    # list where values at an even index denote the first codepoint in
    # a range that is in the set, and the immediately following value
    # indicates the next following codepoint that is not in the set.
    # A missing value at the end is interpreted as MaxCodePoint.  For
    # example, the sequence [ 12, 15, 200 ] denotes the set containing
    # codepoints 12, 13, 14, and everything above 199.
    __codepoints = None

    def _codepoints (self):
        """For testing purposes only, access to the codepoints
        internal representation."""
        return self.__codepoints

    def __hash__ (self):
        """Hash the set by the current content of its boundary list.

        The boundary list is converted to a tuple first because lists
        are unhashable.  The value is consistent with L{__eq__}, but it
        changes whenever the set is mutated, so an instance should not
        be modified while it is used as a dictionary key or set
        member."""
        return hash(tuple(self.__codepoints))

    def __eq__ (self, other):
        """Equality is delegated to the codepoints list."""
        return self.__codepoints == other.__codepoints

    def __lt__ (self, other):
        """Ordering is delegated to the codepoints list."""
        return self.__codepoints < other.__codepoints

    def __init__ (self, *args):
        """Create a code point set.

        @param args: Either a single L{CodePointSet} to copy, a single
        list of values, or any number of values; each value must be
        acceptable to L{add}."""
        self.__codepoints = []
        if 1 == len(args):
            if isinstance(args[0], CodePointSet):
                # Copy constructor: duplicate the boundary list directly.
                self.__codepoints.extend(args[0].__codepoints)
                return
            if isinstance(args[0], list):
                args = args[0]
        for a in args:
            self.add(a)

    def __mutate (self, value, do_add):
        """Add or remove a code point or range of code points.

        @param value: An integral code point, a single-character
        string, or a tuple C{(s,e)} denoting the start and end
        (inclusive) code points of a range.
        @param do_add: C{True} to add the value to the set, C{False}
        to remove it.
        @return: C{self}
        @raise TypeError: a string value is longer than one character
        @raise ValueError: the end of a range precedes its start"""
        # Identify the start (inclusive) and end (exclusive) code
        # points of the value's range.
        if isinstance(value, tuple):
            (s, e) = value
            e += 1
        elif isinstance(value, six.string_types):
            if 1 < len(value):
                raise TypeError('expected a single character, not a string of length %d' % (len(value),))
            s = ord(value)
            e = s+1
        else:
            s = int(value)
            e = s+1
        if s >= e:
            raise ValueError('codepoint range value order')

        # Validate the range for the code points supported by this
        # Python interpreter.  Recall that e is exclusive.
        if s > self.MaxCodePoint:
            return self
        if e > self.MaxCodePoint:
            e = self.MaxCodePoint+1

        # Index of first boundary equal to or greater than s
        li = bisect.bisect_left(self.__codepoints, s)
        # Index just past the last boundary less than or equal to e
        ri = bisect.bisect_right(self.__codepoints, e)
        # There are four cases, selected by the parity of the two
        # indexes; if we're subtracting, they reflect.
        case = ((li & 1) << 1) | (ri & 1)
        if not do_add:
            case = 3 - case
        if 0x03 == case:
            # Add: Incoming value begins and ends within existing ranges
            del self.__codepoints[li:ri]
        elif 0x02 == case:
            # Add: Incoming value extends into an excluded range
            del self.__codepoints[li+1:ri]
            self.__codepoints[li] = e
        elif 0x01 == case:
            # Add: Incoming value begins in an excluded range
            del self.__codepoints[li+1:ri]
            self.__codepoints[li] = s
        else:
            # Add: Incoming value begins and ends within excluded ranges
            self.__codepoints[li:ri] = [s, e]
        return self

    def add (self, value):
        """Add the given value to the code point set.

        @param value: An integral value denoting a code point, or a
        tuple C{(s,e)} denoting the start and end (inclusive) code
        points in a range.
        @return: C{self}"""
        return self.__mutate(value, True)

    def extend (self, values):
        """Add multiple values to a code point set.

        @param values: Either a L{CodePointSet} instance, or an iterable
        whose members are valid parameters to L{add}.

        @return: C{self}"""
        if isinstance(values, CodePointSet):
            self.extend(values.asTuples())
        else:
            for v in values:
                self.__mutate(v, True)
        return self

    def subtract (self, value):
        """Remove the given value from the code point set.

        @param value: An integral value denoting a code point, or a tuple
        C{(s,e)} denoting the start and end (inclusive) code points in a
        range, or a L{CodePointSet}.

        @return: C{self}"""
        if isinstance(value, CodePointSet):
            for v in value.asTuples():
                self.subtract(v)
            return self
        return self.__mutate(value, False)

    # Code points of characters that must not appear unescaped in
    # Python regular expression character groups.  L{__unichr}
    # prefixes each of them with a backslash.
    __XMLtoPythonREEscapedCodepoints = (
        # From docs for Python's "re" module: Regular expression
        # pattern strings may not contain null bytes
        0,
        # Indicates negation if it happens to occur at the start of a
        # character group
        ord('^'),
        # Escape character (backslash)
        ord('\\'),
        # Actually doesn't need to be escaped inside a Python
        # character group, but escaping it is less confusing.
        ord('['),
        # End of character group
        ord(']'),
        # Indicates a range of characters
        ord('-')
        )

    # Return the given code point as a unicode character suitable for
    # use in a regular expression
    def __unichr (self, code_point):
        rv = six.unichr(code_point)
        if 0 == code_point:
            # Combined with the backslash added below this yields the
            # four-character escape sequence \x00 rather than a
            # literal null character.
            rv = six.u('x00')
        if code_point in self.__XMLtoPythonREEscapedCodepoints:
            rv = six.unichr(0x5c) + rv
        return rv

    def asPattern (self, with_brackets=True):
        """Return the code point set as Unicode regular expression
        character group consisting of a sequence of characters or
        character ranges.

        This returns a regular expression fragment using Python's
        regular expression syntax.  Note that different regular expression
        syntaxes are not compatible, often in subtle ways.

        @param with_brackets: If C{True} (default), square brackets
        are added to enclose the returned character group."""
        rva = []
        if with_brackets:
            rva.append(six.u('['))
        for (s, e) in self.asTuples():
            if s == e:
                rva.append(self.__unichr(s))
            else:
                rva.extend([self.__unichr(s), '-', self.__unichr(e)])
        if with_brackets:
            rva.append(six.u(']'))
        return six.u('').join(rva)

    def asTuples (self):
        """Return the codepoints as tuples denoting the ranges that are in
        the set.

        Each tuple C{(s, e)} indicates that the code points from C{s}
        (inclusive) to C{e} (inclusive) are in the set."""

        cp = self.__codepoints
        # Boundaries come in (start, exclusive end) pairs; convert each
        # pair to an inclusive range.
        rv = [ (start, end-1) for (start, end) in zip(cp[0::2], cp[1::2]) ]
        if len(cp) & 1:
            # An unpaired final boundary opens a range that runs to the
            # largest supported code point.
            start = cp[-1]
            if start <= self.MaxCodePoint:
                rv.append( (start, self.MaxCodePoint) )
        return rv

    def negate (self):
        """Return an instance that represents the inverse of this set."""
        rv = type(self)()
        if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]):
            # The set starts at code point 0: dropping that boundary
            # flips membership of every code point.
            rv.__codepoints.extend(self.__codepoints[1:])
        else:
            # Otherwise a new leading boundary at 0 flips membership.
            rv.__codepoints.append(0)
            rv.__codepoints.extend(self.__codepoints)
        return rv

    def asSingleCharacter (self):
        """If this set represents a single character, return it as its
        unicode string value.  Otherwise return C{None}."""
        if (2 != len(self.__codepoints)) or (1 < (self.__codepoints[1] - self.__codepoints[0])):
            return None
        return six.unichr(self.__codepoints[0])
283 284 from pyxb.utils.unicode_data import PropertyMap 285 from pyxb.utils.unicode_data import BlockMap
class XML1p0e2 (object):
    """Regular expression support for XML Schema Data Types.

    This class holds character classes and regular expressions used to
    constrain the lexical space of XML Schema datatypes derived from
    U{string<http://www.w3.org/TR/xmlschema-2/#string>}.  They are
    from U{XML 1.0 (Second
    Edition)<http://www.w3.org/TR/2000/WD-xml-2e-20000814>} and
    U{Namespaces in XML
    <http://www.w3.org/TR/1999/REC-xml-names-19990114/>}.

    Unlike the regular expressions used for pattern constraints in XML
    Schema, which are derived from the Unicode 3.1 specification,
    these are derived from the Unicode 2.0 specification.

    The XML Schema definition refers explicitly to the second edition
    of XML, so we have to use these code point sets and patterns.  Be
    aware that U{subsequent updates to the XML specification
    <http://www.w3.org/XML/xml-V10-4e-errata#E09>} have changed the
    corresponding patterns for other uses of XML.  One significant
    change is that the original specification, used here, does not
    allow wide unicode characters."""

    Char = CodePointSet(
        0x0009,
        0x000A,
        0x000D,
        ( 0x0020, 0xD7FF ),
        ( 0xE000, 0xFFFD )
        )
    if SupportsWideUnicode:
        Char.add( ( 1+CodePointSet.MaxShortCodePoint, CodePointSet.MaxCodePoint ) )

    BaseChar = CodePointSet(
        (0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x00D6), (0x00D8, 0x00F6),
        (0x00F8, 0x00FF), (0x0100, 0x0131), (0x0134, 0x013E), (0x0141, 0x0148),
        (0x014A, 0x017E), (0x0180, 0x01C3), (0x01CD, 0x01F0), (0x01F4, 0x01F5),
        (0x01FA, 0x0217), (0x0250, 0x02A8), (0x02BB, 0x02C1), 0x0386,
        (0x0388, 0x038A), 0x038C, (0x038E, 0x03A1), (0x03A3, 0x03CE),
        (0x03D0, 0x03D6), 0x03DA, 0x03DC, 0x03DE, 0x03E0, (0x03E2, 0x03F3),
        (0x0401, 0x040C), (0x040E, 0x044F), (0x0451, 0x045C), (0x045E, 0x0481),
        (0x0490, 0x04C4), (0x04C7, 0x04C8), (0x04CB, 0x04CC), (0x04D0, 0x04EB),
        (0x04EE, 0x04F5), (0x04F8, 0x04F9), (0x0531, 0x0556), 0x0559,
        (0x0561, 0x0586), (0x05D0, 0x05EA), (0x05F0, 0x05F2), (0x0621, 0x063A),
        (0x0641, 0x064A), (0x0671, 0x06B7), (0x06BA, 0x06BE), (0x06C0, 0x06CE),
        (0x06D0, 0x06D3), 0x06D5, (0x06E5, 0x06E6), (0x0905, 0x0939), 0x093D,
        (0x0958, 0x0961), (0x0985, 0x098C), (0x098F, 0x0990), (0x0993, 0x09A8),
        (0x09AA, 0x09B0), 0x09B2, (0x09B6, 0x09B9), (0x09DC, 0x09DD),
        (0x09DF, 0x09E1), (0x09F0, 0x09F1), (0x0A05, 0x0A0A), (0x0A0F, 0x0A10),
        (0x0A13, 0x0A28), (0x0A2A, 0x0A30), (0x0A32, 0x0A33), (0x0A35, 0x0A36),
        (0x0A38, 0x0A39), (0x0A59, 0x0A5C), 0x0A5E, (0x0A72, 0x0A74),
        (0x0A85, 0x0A8B), 0x0A8D, (0x0A8F, 0x0A91), (0x0A93, 0x0AA8),
        (0x0AAA, 0x0AB0), (0x0AB2, 0x0AB3), (0x0AB5, 0x0AB9), 0x0ABD, 0x0AE0,
        (0x0B05, 0x0B0C), (0x0B0F, 0x0B10), (0x0B13, 0x0B28), (0x0B2A, 0x0B30),
        (0x0B32, 0x0B33), (0x0B36, 0x0B39), 0x0B3D, (0x0B5C, 0x0B5D),
        (0x0B5F, 0x0B61), (0x0B85, 0x0B8A), (0x0B8E, 0x0B90), (0x0B92, 0x0B95),
        (0x0B99, 0x0B9A), 0x0B9C, (0x0B9E, 0x0B9F), (0x0BA3, 0x0BA4),
        (0x0BA8, 0x0BAA), (0x0BAE, 0x0BB5), (0x0BB7, 0x0BB9), (0x0C05, 0x0C0C),
        (0x0C0E, 0x0C10), (0x0C12, 0x0C28), (0x0C2A, 0x0C33), (0x0C35, 0x0C39),
        (0x0C60, 0x0C61), (0x0C85, 0x0C8C), (0x0C8E, 0x0C90), (0x0C92, 0x0CA8),
        (0x0CAA, 0x0CB3), (0x0CB5, 0x0CB9), 0x0CDE, (0x0CE0, 0x0CE1),
        (0x0D05, 0x0D0C), (0x0D0E, 0x0D10), (0x0D12, 0x0D28), (0x0D2A, 0x0D39),
        (0x0D60, 0x0D61), (0x0E01, 0x0E2E), 0x0E30, (0x0E32, 0x0E33),
        (0x0E40, 0x0E45), (0x0E81, 0x0E82), 0x0E84, (0x0E87, 0x0E88),
        0x0E8A, 0x0E8D, (0x0E94, 0x0E97), (0x0E99, 0x0E9F), (0x0EA1, 0x0EA3),
        0x0EA5, 0x0EA7, (0x0EAA, 0x0EAB), (0x0EAD, 0x0EAE), 0x0EB0,
        (0x0EB2, 0x0EB3), 0x0EBD, (0x0EC0, 0x0EC4), (0x0F40, 0x0F47),
        (0x0F49, 0x0F69), (0x10A0, 0x10C5), (0x10D0, 0x10F6), 0x1100,
        (0x1102, 0x1103), (0x1105, 0x1107), 0x1109, (0x110B, 0x110C),
        (0x110E, 0x1112), 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150,
        (0x1154, 0x1155), 0x1159, (0x115F, 0x1161), 0x1163, 0x1165, 0x1167,
        0x1169, (0x116D, 0x116E), (0x1172, 0x1173), 0x1175, 0x119E, 0x11A8,
        0x11AB, (0x11AE, 0x11AF), (0x11B7, 0x11B8), 0x11BA, (0x11BC, 0x11C2),
        0x11EB, 0x11F0, 0x11F9, (0x1E00, 0x1E9B), (0x1EA0, 0x1EF9),
        (0x1F00, 0x1F15), (0x1F18, 0x1F1D), (0x1F20, 0x1F45), (0x1F48, 0x1F4D),
        (0x1F50, 0x1F57), 0x1F59, 0x1F5B, 0x1F5D, (0x1F5F, 0x1F7D),
        (0x1F80, 0x1FB4), (0x1FB6, 0x1FBC), 0x1FBE, (0x1FC2, 0x1FC4),
        (0x1FC6, 0x1FCC), (0x1FD0, 0x1FD3), (0x1FD6, 0x1FDB), (0x1FE0, 0x1FEC),
        (0x1FF2, 0x1FF4), (0x1FF6, 0x1FFC), 0x2126, (0x212A, 0x212B), 0x212E,
        (0x2180, 0x2182), (0x3041, 0x3094), (0x30A1, 0x30FA), (0x3105, 0x312C),
        (0xAC00, 0xD7A3)
        )

    Ideographic = CodePointSet(
        ( 0x4E00, 0x9FA5 ),
        0x3007,
        ( 0x3021, 0x3029 )
        )

    Letter = CodePointSet(BaseChar).extend(Ideographic)

    CombiningChar = CodePointSet(
        (0x0300, 0x0345), (0x0360, 0x0361), (0x0483, 0x0486), (0x0591, 0x05A1),
        (0x05A3, 0x05B9), (0x05BB, 0x05BD), 0x05BF, (0x05C1, 0x05C2), 0x05C4,
        (0x064B, 0x0652), 0x0670, (0x06D6, 0x06DC), (0x06DD, 0x06DF),
        (0x06E0, 0x06E4), (0x06E7, 0x06E8), (0x06EA, 0x06ED), (0x0901, 0x0903),
        0x093C, (0x093E, 0x094C), 0x094D, (0x0951, 0x0954), (0x0962, 0x0963),
        (0x0981, 0x0983), 0x09BC, 0x09BE, 0x09BF, (0x09C0, 0x09C4),
        (0x09C7, 0x09C8), (0x09CB, 0x09CD), 0x09D7, (0x09E2, 0x09E3),
        0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, (0x0A40, 0x0A42), (0x0A47, 0x0A48),
        (0x0A4B, 0x0A4D), (0x0A70, 0x0A71), (0x0A81, 0x0A83), 0x0ABC,
        (0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACD), (0x0B01, 0x0B03),
        0x0B3C, (0x0B3E, 0x0B43), (0x0B47, 0x0B48), (0x0B4B, 0x0B4D),
        (0x0B56, 0x0B57), (0x0B82, 0x0B83), (0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8),
        (0x0BCA, 0x0BCD), 0x0BD7, (0x0C01, 0x0C03), (0x0C3E, 0x0C44),
        (0x0C46, 0x0C48), (0x0C4A, 0x0C4D), (0x0C55, 0x0C56), (0x0C82, 0x0C83),
        (0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8), (0x0CCA, 0x0CCD), (0x0CD5, 0x0CD6),
        (0x0D02, 0x0D03), (0x0D3E, 0x0D43), (0x0D46, 0x0D48), (0x0D4A, 0x0D4D),
        0x0D57, 0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E), 0x0EB1,
        (0x0EB4, 0x0EB9), (0x0EBB, 0x0EBC), (0x0EC8, 0x0ECD), (0x0F18, 0x0F19),
        0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, (0x0F71, 0x0F84),
        (0x0F86, 0x0F8B), (0x0F90, 0x0F95), 0x0F97, (0x0F99, 0x0FAD),
        (0x0FB1, 0x0FB7), 0x0FB9, (0x20D0, 0x20DC), 0x20E1, (0x302A, 0x302F),
        0x3099, 0x309A
        )

    Digit = CodePointSet(
        (0x0030, 0x0039), (0x0660, 0x0669), (0x06F0, 0x06F9), (0x0966, 0x096F),
        (0x09E6, 0x09EF), (0x0A66, 0x0A6F), (0x0AE6, 0x0AEF), (0x0B66, 0x0B6F),
        (0x0BE7, 0x0BEF), (0x0C66, 0x0C6F), (0x0CE6, 0x0CEF), (0x0D66, 0x0D6F),
        (0x0E50, 0x0E59), (0x0ED0, 0x0ED9), (0x0F20, 0x0F29)
        )

    Extender = CodePointSet(
        0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        (0x3031, 0x3035), (0x309D, 0x309E), (0x30FC, 0x30FE)
        )

    # Not an explicit production, but used in Name production
    NameStartChar = CodePointSet(Letter)
    NameStartChar.add(ord('_'))
    NameStartChar.add(ord(':'))

    # As NameStartChar, but without the colon
    NCNameStartChar = CodePointSet(Letter)
    NCNameStartChar.add(ord('_'))

    NameChar = CodePointSet(Letter)
    NameChar.extend(Digit)
    NameChar.add(ord('.'))
    NameChar.add(ord('-'))
    NameChar.add(ord('_'))
    NameChar.add(ord(':'))
    NameChar.extend(CombiningChar)
    NameChar.extend(Extender)

    # As NameChar, but without the colon
    NCNameChar = CodePointSet(Letter)
    NCNameChar.extend(Digit)
    NCNameChar.add(ord('.'))
    NCNameChar.add(ord('-'))
    NCNameChar.add(ord('_'))
    NCNameChar.extend(CombiningChar)
    NCNameChar.extend(Extender)

    # The compiled expressions anchor the end of the match with \Z
    # rather than $, because $ also matches just before a newline that
    # ends the string, which would accept values such as "name\n".
    Name_pat = '%s%s*' % (NameStartChar.asPattern(), NameChar.asPattern())
    Name_re = re.compile('^%s\\Z' % (Name_pat,))
    NmToken_pat = '%s+' % (NameChar.asPattern(),)
    NmToken_re = re.compile('^%s\\Z' % (NmToken_pat,))
    NCName_pat = '%s%s*' % (NCNameStartChar.asPattern(), NCNameChar.asPattern())
    NCName_re = re.compile('^%s\\Z' % (NCName_pat,))
    QName_pat = '(%s:)?%s' % (NCName_pat, NCName_pat)
    QName_re = re.compile('^%s\\Z' % (QName_pat,))
# Production 24 : Single Character Escapes
# Maps the character following the backslash to the set holding the
# single code point that the escape denotes.
SingleCharEsc = {
    'n' : CodePointSet(0x0A),
    'r' : CodePointSet(0x0D),
    't' : CodePointSet(0x09),
    }
# Each of these characters, when escaped, denotes itself.
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))

# Production 25 : Category Escapes
# Production 26: Complemented Category Escapes
# NOTE: both the plain (p{...}) and complemented (P{...}) entries are
# stored in catEsc; complEsc is created but left empty.
catEsc = { }
complEsc = { }
for k, v in six.iteritems(PropertyMap):
    catEsc[six.u('p{%s}') % (k,)] = v
    catEsc[six.u('P{%s}') % (k,)] = v.negate()

# Production 36 : IsBlock escapes
IsBlockEsc = { }
for k, v in six.iteritems(BlockMap):
    IsBlockEsc[six.u('p{Is%s}') % (k,)] = v
    IsBlockEsc[six.u('P{Is%s}') % (k,)] = v.negate()

# Production 37 : Multi-Character Escapes
# The wildcard is every code point other than newline and carriage return.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()
# Each upper-case key other than 'W' holds the negation of its
# lower-case counterpart; 'w' is the negation of 'W'.
MultiCharEsc = { }
MultiCharEsc['s'] = CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r'))
MultiCharEsc['S'] = MultiCharEsc['s'].negate()
MultiCharEsc['i'] = CodePointSet(XML1p0e2.Letter).add(ord('_')).add(ord(':'))
MultiCharEsc['I'] = MultiCharEsc['i'].negate()
MultiCharEsc['c'] = CodePointSet(XML1p0e2.NameChar)
MultiCharEsc['C'] = MultiCharEsc['c'].negate()
MultiCharEsc['d'] = PropertyMap['Nd']
MultiCharEsc['D'] = MultiCharEsc['d'].negate()
MultiCharEsc['W'] = CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C'])
MultiCharEsc['w'] = MultiCharEsc['W'].negate()